supervertaler 1.9.153__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic. Click here for more details.

Files changed (85)
  1. Supervertaler.py +47886 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1878 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +333 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1172 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.153.dist-info/METADATA +896 -0
  81. supervertaler-1.9.153.dist-info/RECORD +85 -0
  82. supervertaler-1.9.153.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.153.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1401 @@
1
+ """
2
+ Superbench - Qt UI Components
3
+ ==============================
4
+
5
+ PyQt6 user interface for LLM translation benchmarking.
6
+
7
+ Features:
8
+ - Test dataset selection
9
+ - Model selection (checkboxes)
10
+ - Benchmark execution with progress
11
+ - Results table with comparison
12
+ - Summary statistics panel
13
+ - Export functionality
14
+
15
+ Author: Michael Beijer
16
+ License: MIT
17
+ """
18
+
19
+ from PyQt6.QtWidgets import (
20
+ QWidget, QVBoxLayout, QHBoxLayout, QGroupBox, QLabel,
21
+ QPushButton, QComboBox, QCheckBox, QTableWidget, QTableWidgetItem,
22
+ QProgressBar, QTextEdit, QSplitter, QHeaderView, QMessageBox,
23
+ QFileDialog, QRadioButton, QSpinBox
24
+ )
25
+ from PyQt6.QtCore import Qt, QThread, pyqtSignal, QPointF
26
+ from PyQt6.QtGui import QColor, QFont, QPainter, QPen
27
+ from PyQt6.QtWidgets import QStyleOptionButton
28
+ from typing import List, Optional, Dict
29
+ import json
30
+ from pathlib import Path
31
+
32
+ try:
33
+ from modules.llm_leaderboard import (
34
+ LLMLeaderboard, TestDataset, ModelConfig, BenchmarkResult,
35
+ create_sample_datasets, create_dataset_from_project, CHRF_AVAILABLE
36
+ )
37
+ except ImportError:
38
+ from llm_leaderboard import (
39
+ LLMLeaderboard, TestDataset, ModelConfig, BenchmarkResult,
40
+ create_sample_datasets, create_dataset_from_project, CHRF_AVAILABLE
41
+ )
42
+
43
+
44
class CheckmarkCheckBox(QCheckBox):
    """Check box styled with a green indicator and a hand-painted white tick.

    The stylesheet colours the indicator; ``paintEvent`` draws the tick on top
    of it whenever the box is checked.
    """

    def __init__(self, text="", parent=None):
        """Create the check box and install the indicator stylesheet.

        Args:
            text: Label shown next to the indicator.
            parent: Optional parent widget.
        """
        super().__init__(text, parent)
        self.setCheckable(True)
        self.setEnabled(True)
        self.setStyleSheet("""
            QCheckBox {
                font-size: 9pt;
                spacing: 6px;
            }
            QCheckBox::indicator {
                width: 18px;
                height: 18px;
                border: 2px solid #999;
                border-radius: 3px;
                background-color: white;
            }
            QCheckBox::indicator:checked {
                background-color: #4CAF50;
                border-color: #4CAF50;
            }
            QCheckBox::indicator:hover {
                border-color: #666;
            }
            QCheckBox::indicator:checked:hover {
                background-color: #45a049;
                border-color: #45a049;
            }
        """)

    def paintEvent(self, event):
        """Run the default painting, then overlay a white tick if checked."""
        super().paintEvent(event)

        if not self.isChecked():
            return

        option = QStyleOptionButton()
        self.initStyleOption(option)
        style = self.style()
        box = style.subElementRect(style.SubElement.SE_CheckBoxIndicator, option, self)
        if not box.isValid():
            return

        painter = QPainter(self)
        try:
            painter.setRenderHint(QPainter.RenderHint.Antialiasing)

            # Stroke width scales with the indicator but never drops below 2.0.
            stroke = max(2.0, min(box.width(), box.height()) * 0.12)
            white = QColor(255, 255, 255)
            painter.setPen(QPen(
                white,
                stroke,
                Qt.PenStyle.SolidLine,
                Qt.PenCapStyle.RoundCap,
                Qt.PenJoinStyle.RoundJoin,
            ))
            painter.setBrush(QColor(255, 255, 255))

            # Shrink the drawing area by 15% of the shorter side on every edge.
            inset = min(box.width(), box.height()) * 0.15
            left = box.x() + inset
            top = box.y() + inset
            span_w = box.width() - inset * 2
            span_h = box.height() - inset * 2

            # Three points of the tick: short-stroke start, corner, long-stroke tip.
            start = QPointF(left + span_w * 0.10, top + span_h * 0.50)
            corner = QPointF(left + span_w * 0.35, top + span_h * 0.70)
            tip = QPointF(left + span_w * 0.90, top + span_h * 0.25)

            painter.drawLine(corner, tip)
            painter.drawLine(start, corner)
        finally:
            painter.end()
124
+
125
+
126
class CustomRadioButton(QRadioButton):
    """Radio button with a square green indicator and a hand-painted white tick.

    The stylesheet gives the indicator its square, green-when-checked look;
    ``paintEvent`` draws the tick on top of it whenever the button is checked.
    """

    def __init__(self, text="", parent=None):
        """Create the radio button and install the indicator stylesheet.

        Args:
            text: Label shown next to the indicator.
            parent: Optional parent widget.
        """
        super().__init__(text, parent)
        self.setCheckable(True)
        self.setEnabled(True)
        self.setStyleSheet("""
            QRadioButton {
                font-size: 9pt;
                spacing: 6px;
            }
            QRadioButton::indicator {
                width: 18px;
                height: 18px;
                border: 2px solid #999;
                border-radius: 3px;
                background-color: white;
            }
            QRadioButton::indicator:checked {
                background-color: #4CAF50;
                border-color: #4CAF50;
            }
            QRadioButton::indicator:hover {
                border-color: #666;
            }
            QRadioButton::indicator:checked:hover {
                background-color: #45a049;
                border-color: #45a049;
            }
        """)

    def paintEvent(self, event):
        """Run the default painting, then overlay a white tick if checked."""
        super().paintEvent(event)

        if not self.isChecked():
            return

        option = QStyleOptionButton()
        self.initStyleOption(option)
        style = self.style()
        box = style.subElementRect(style.SubElement.SE_RadioButtonIndicator, option, self)
        if not box.isValid():
            return

        painter = QPainter(self)
        try:
            painter.setRenderHint(QPainter.RenderHint.Antialiasing)

            # Stroke width scales with the indicator but never drops below 2.0.
            stroke = max(2.0, min(box.width(), box.height()) * 0.12)
            white = QColor(255, 255, 255)
            painter.setPen(QPen(
                white,
                stroke,
                Qt.PenStyle.SolidLine,
                Qt.PenCapStyle.RoundCap,
                Qt.PenJoinStyle.RoundJoin,
            ))
            painter.setBrush(QColor(255, 255, 255))

            # Shrink the drawing area by 15% of the shorter side on every edge.
            inset = min(box.width(), box.height()) * 0.15
            left = box.x() + inset
            top = box.y() + inset
            span_w = box.width() - inset * 2
            span_h = box.height() - inset * 2

            # Three points of the tick: short-stroke start, corner, long-stroke tip.
            start = QPointF(left + span_w * 0.10, top + span_h * 0.50)
            corner = QPointF(left + span_w * 0.35, top + span_h * 0.70)
            tip = QPointF(left + span_w * 0.90, top + span_h * 0.25)

            painter.drawLine(corner, tip)
            painter.drawLine(start, corner)
        finally:
            painter.end()
206
+
207
+
208
class BenchmarkThread(QThread):
    """Worker thread that executes ``leaderboard.run_benchmark`` off the UI thread.

    Results are kept on the instance (``self.results``) instead of being sent
    through a signal; the completion signal carries no payload.
    """

    # (current, total, message) forwarded from the benchmark's progress callback
    progress_update = pyqtSignal(int, int, str)
    # Emitted explicitly by run() on success; intentionally has no arguments
    finished = pyqtSignal()
    # Carries str(exception) when run() fails
    error = pyqtSignal(str)

    def __init__(self, leaderboard: LLMLeaderboard, dataset: TestDataset, models: List[ModelConfig]):
        """Remember what to benchmark.

        Args:
            leaderboard: Object whose ``run_benchmark`` is invoked in ``run``.
            dataset: Dataset handed to ``run_benchmark``.
            models: Model configurations handed to ``run_benchmark``.
        """
        super().__init__()
        self.leaderboard = leaderboard
        self.dataset = dataset
        self.models = models
        # Filled by run(); read by the receiver of the ``finished`` signal.
        self.results = []

    def run(self):
        """Execute the benchmark, then emit ``finished`` or ``error``."""
        try:
            model_count = len(self.models)
            segment_count = len(self.dataset.segments)
            print(f"[BENCHMARK THREAD] Starting benchmark with {model_count} models on {segment_count} segments")

            outcome = self.leaderboard.run_benchmark(
                self.dataset,
                self.models,
                progress_callback=self._on_progress,
            )
            self.results = outcome
            print(f"[BENCHMARK THREAD] Benchmark completed with {len(self.results)} results")

            # Results stay on the thread object; the signal is payload-free.
            self.finished.emit()
            print(f"[BENCHMARK THREAD] Finished signal emitted successfully")
        except Exception as exc:
            print(f"[BENCHMARK THREAD] ERROR: {exc}")
            import traceback
            print(f"[BENCHMARK THREAD] TRACEBACK:\n{traceback.format_exc()}")
            self.error.emit(str(exc))

    def _on_progress(self, current: int, total: int, message: str):
        """Relay a progress callback as the ``progress_update`` signal.

        A failure while emitting is printed and otherwise ignored.
        """
        try:
            self.progress_update.emit(current, total, message)
        except Exception as exc:
            print(f"[BENCHMARK THREAD] Progress update failed: {exc}")
248
+
249
+
250
+ class LLMLeaderboardUI(QWidget):
251
+ """Main UI widget for Superbench"""
252
+
253
def __init__(self, parent=None, llm_client_factory=None):
    """Set up state, load the sample datasets and build the UI.

    Args:
        parent: Parent widget; also stored as ``parent_app``.
        llm_client_factory: Stored as-is for later use when a benchmark is run.
    """
    super().__init__(parent)

    # Collaborators supplied by the caller
    self.parent_app = parent
    self.llm_client_factory = llm_client_factory

    # Benchmark run state
    self.leaderboard = None
    self.benchmark_thread = None
    self.current_results = []

    # Sample datasets; the first one (if any) is the initial selection
    sample_sets = create_sample_datasets()
    self.datasets = sample_sets
    if sample_sets:
        self.current_dataset = sample_sets[0]
    else:
        self.current_dataset = None

    # Project-derived dataset is created on demand
    self.project_dataset = None
    self.project_metadata = None

    self.init_ui()
268
+
269
def init_ui(self):
    """Assemble the widget: header, description, selection row, progress, status and splitter."""
    root = QVBoxLayout()
    root.setSpacing(5)
    root.setContentsMargins(10, 10, 10, 10)

    # Title line (stretch 0 keeps it compact)
    title = QLabel("📊 Superbench")
    title.setStyleSheet("font-size: 16pt; font-weight: bold; color: #1976D2;")
    root.addWidget(title, 0)

    # Short description box under the title
    blurb = QLabel(
        "LLM Translation Quality Benchmarking System - A Supervertaler Module.\n"
        "Compare translation quality, speed, and cost across multiple LLM providers."
    )
    blurb.setWordWrap(True)
    blurb.setStyleSheet("color: #666; padding: 5px; background-color: #E3F2FD; border-radius: 3px;")
    root.addWidget(blurb, 0)

    root.addSpacing(10)

    # Dataset + model selection row
    root.addWidget(self._create_top_section())

    # Progress bar stays hidden until a benchmark starts
    self.progress_bar = QProgressBar()
    self.progress_bar.setVisible(False)
    root.addWidget(self.progress_bar)

    self.status_label = QLabel("Ready")
    self.status_label.setStyleSheet("color: #666; font-size: 9pt;")
    root.addWidget(self.status_label)

    # Vertical splitter: results table, summary panel, log
    panes = QSplitter(Qt.Orientation.Vertical)

    self.results_table = self._create_results_table()
    panes.addWidget(self.results_table)

    self.summary_panel = self._create_summary_panel()
    panes.addWidget(self.summary_panel)

    self.log_output = QTextEdit()
    self.log_output.setReadOnly(True)
    self.log_output.setMaximumHeight(150)
    self.log_output.setPlaceholderText("Benchmark log will appear here...")
    panes.addWidget(self.log_output)

    # Stretch factors by pane index: results 3, summary 1, log 1
    for pane_index, factor in enumerate((3, 1, 1)):
        panes.setStretchFactor(pane_index, factor)

    root.addWidget(panes)
    self.setLayout(root)
331
+
332
def _create_top_section(self) -> QWidget:
    """Build the row containing the "Test Dataset" and "Model Selection" groups.

    Returns:
        A widget whose horizontal layout holds both group boxes.
    """
    container = QWidget()
    row = QHBoxLayout()

    # ---------------- Left: dataset source ----------------
    dataset_box = QGroupBox("Test Dataset")
    dataset_column = QVBoxLayout()

    self.predefined_radio = CustomRadioButton("Pre-defined Datasets")
    self.predefined_radio.setChecked(True)
    self.predefined_radio.toggled.connect(self._on_dataset_source_changed)
    dataset_column.addWidget(self.predefined_radio)

    # Combo of sample datasets; the change handler is connected after filling
    self.dataset_combo = QComboBox()
    for dataset in self.datasets:
        entry_text = f"{dataset.name} ({len(dataset.segments)} segments)"
        self.dataset_combo.addItem(entry_text, dataset)
    self.dataset_combo.currentIndexChanged.connect(self._on_dataset_changed)
    dataset_column.addWidget(self.dataset_combo)

    dataset_column.addSpacing(10)

    self.project_radio = CustomRadioButton("Current Project")
    self.project_radio.toggled.connect(self._on_dataset_source_changed)
    dataset_column.addWidget(self.project_radio)

    # Controls shown only in "Current Project" mode (indented by 20 px)
    self.project_controls_widget = QWidget()
    project_column = QVBoxLayout()
    project_column.setContentsMargins(20, 0, 0, 0)

    size_row = QHBoxLayout()
    size_row.addWidget(QLabel("Sample size:"))
    self.sample_size_spin = QSpinBox()
    self.sample_size_spin.setRange(1, 50)
    self.sample_size_spin.setValue(10)
    self.sample_size_spin.setToolTip("Number of segments to sample from project")
    size_row.addWidget(self.sample_size_spin)
    size_row.addStretch()
    project_column.addLayout(size_row)

    method_row = QHBoxLayout()
    method_row.addWidget(QLabel("Method:"))
    self.sampling_method_combo = QComboBox()
    self.sampling_method_combo.addItems(["Smart Sampling", "Random", "Evenly Spaced"])
    self.sampling_method_combo.setToolTip(
        "Smart: 30% begin, 40% middle, 30% end\n"
        "Random: Random selection\n"
        "Evenly Spaced: Every Nth segment"
    )
    method_row.addWidget(self.sampling_method_combo)
    method_row.addStretch()
    project_column.addLayout(method_row)

    self.project_status_label = QLabel("Project status: No project loaded")
    self.project_status_label.setStyleSheet("color: #666; font-size: 9pt; padding: 5px;")
    self.project_status_label.setWordWrap(True)
    project_column.addWidget(self.project_status_label)

    self.create_dataset_button = QPushButton("📊 Create Test Dataset from Project")
    self.create_dataset_button.clicked.connect(self._on_create_project_dataset)
    self.create_dataset_button.setEnabled(False)
    project_column.addWidget(self.create_dataset_button)

    self.project_controls_widget.setLayout(project_column)
    self.project_controls_widget.setVisible(False)
    dataset_column.addWidget(self.project_controls_widget)

    dataset_column.addStretch()
    dataset_box.setLayout(dataset_column)
    row.addWidget(dataset_box)

    # Needs project_status_label and create_dataset_button, both created above
    self._update_project_status()

    # ---------------- Right: model selection ----------------
    model_box = QGroupBox("Model Selection")
    model_column = QVBoxLayout()
    model_column.addWidget(QLabel("Select models to test:"))

    def add_provider(caption, model_ids):
        """Add a pre-checked provider check box followed by its model combo."""
        toggle = CheckmarkCheckBox(caption)
        toggle.setChecked(True)
        model_column.addWidget(toggle)

        chooser = QComboBox()
        chooser.addItems(model_ids)
        chooser.setEnabled(True)
        model_column.addWidget(chooser)
        return toggle, chooser

    self.openai_checkbox, self.openai_model_combo = add_provider(
        "OpenAI (GPT-4o)",
        [
            "gpt-4o",
            "gpt-4o-mini",
            "gpt-5",
        ],
    )
    self.claude_checkbox, self.claude_model_combo = add_provider(
        "Claude (Sonnet 4.5)",
        [
            "claude-sonnet-4-5-20250929",
            "claude-haiku-4-5-20251001",
            "claude-opus-4-1-20250805",
        ],
    )
    self.gemini_checkbox, self.gemini_model_combo = add_provider(
        "Gemini (2.5 Flash)",
        [
            "gemini-2.5-flash",
            "gemini-2.5-flash-lite",
            "gemini-2.5-pro",
        ],
    )

    model_column.addStretch()

    # Action buttons: run, cancel (disabled until running), export (disabled until results)
    self.run_button = QPushButton("🚀 Run Benchmark")
    self.run_button.setStyleSheet("font-weight: bold; padding: 8px;")
    self.run_button.clicked.connect(self._on_run_benchmark)
    model_column.addWidget(self.run_button)

    self.cancel_button = QPushButton("Cancel")
    self.cancel_button.setEnabled(False)
    self.cancel_button.clicked.connect(self._on_cancel_benchmark)
    model_column.addWidget(self.cancel_button)

    self.export_button = QPushButton("📊 Export Results...")
    self.export_button.setEnabled(False)
    self.export_button.clicked.connect(self._on_export_results)
    model_column.addWidget(self.export_button)

    model_box.setLayout(model_column)
    row.addWidget(model_box)

    container.setLayout(row)
    return container
487
+
488
def _create_results_table(self) -> QTableWidget:
    """Build the read-only, row-selecting table that lists per-segment results.

    Returns:
        A six-column table with headers set and no rows.
    """
    column_titles = [
        "Segment", "Source Text", "Model", "Translation", "Speed (ms)", "Quality"
    ]

    results = QTableWidget()
    results.setColumnCount(6)
    results.setHorizontalHeaderLabels(column_titles)

    # Text columns (source, translation) stretch; the rest fit their contents.
    fit = QHeaderView.ResizeMode.ResizeToContents
    stretch = QHeaderView.ResizeMode.Stretch
    header_view = results.horizontalHeader()
    for column, mode in enumerate((fit, stretch, fit, stretch, fit, fit)):
        header_view.setSectionResizeMode(column, mode)

    results.setAlternatingRowColors(True)
    results.setEditTriggers(QTableWidget.EditTrigger.NoEditTriggers)
    results.setSelectionBehavior(QTableWidget.SelectionBehavior.SelectRows)

    return results
510
+
511
def _create_summary_panel(self) -> QWidget:
    """Build the "Summary Statistics" group and its table.

    Also stores the table on ``self.summary_table``.

    Returns:
        The group box containing the summary table.
    """
    summary = QTableWidget()
    summary.setColumnCount(5)
    summary.setHorizontalHeaderLabels([
        "Model", "Avg Speed (ms)", "Avg Quality", "Success", "Errors"
    ])
    self.summary_table = summary

    # All columns share the available width.
    summary.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)

    summary.setMaximumHeight(200)
    summary.setAlternatingRowColors(True)
    summary.setEditTriggers(QTableWidget.EditTrigger.NoEditTriggers)

    panel_layout = QVBoxLayout()
    panel_layout.addWidget(summary)

    panel = QGroupBox("Summary Statistics")
    panel.setLayout(panel_layout)
    return panel
533
+
534
def _on_dataset_changed(self, index: int):
    """Make the combo entry at ``index`` the current dataset.

    Args:
        index: Combo-box index whose item data is the dataset to select.

    If the index carries no item data (``itemData`` returns ``None``),
    ``current_dataset`` is cleared and nothing is logged, instead of failing
    on ``None.name``.
    """
    dataset = self.dataset_combo.itemData(index)
    self.current_dataset = dataset
    if dataset is None:
        return
    self.log(f"Selected dataset: {dataset.name}")
538
+
539
def _on_dataset_source_changed(self):
    """React to the dataset-source radio buttons.

    Enables the predefined combo or shows the project controls depending on
    which radio is checked, then points ``current_dataset`` at the matching
    dataset (or ``None`` when no project dataset exists yet).
    """
    print(f"[LLM DEBUG] _on_dataset_source_changed() called")
    use_project = self.project_radio.isChecked()
    print(f"[LLM DEBUG] is_project: {use_project}")

    self.dataset_combo.setEnabled(not use_project)
    self.project_controls_widget.setVisible(use_project)

    if not use_project:
        # Back to whatever the predefined combo currently holds
        self.current_dataset = self.dataset_combo.currentData()
        if self.current_dataset:
            self.log(f"Using predefined dataset: {self.current_dataset.name}")
        return

    # Project mode: refresh the status panel first
    print(f"[LLM DEBUG] Calling _update_project_status() from _on_dataset_source_changed")
    self._update_project_status()

    if self.project_dataset:
        self.current_dataset = self.project_dataset
        self.log(f"Using project dataset: {self.current_dataset.name}")
    else:
        self.current_dataset = None
563
+
564
def _update_project_status(self):
    """Refresh the project status label and the "create dataset" button.

    Shows a warning and disables the button when there is no parent app, no
    loaded project, or the project has no segments. Otherwise shows the total
    and translated segment counts and enables the button.

    A project without a ``segments`` attribute (or with an empty/``None``
    one) is treated as having no segments rather than raising.
    """
    warning_style = "color: #FF6600; font-size: 9pt; padding: 5px;"

    app = self.parent_app
    print(f"[LLM DEBUG] _update_project_status() called")
    print(f"[LLM DEBUG] parent_app exists: {app is not None}")

    # Missing attribute and "no project" are handled the same way.
    project = getattr(app, 'current_project', None) if app else None

    if app:
        print(f"[LLM DEBUG] parent_app has 'current_project': {hasattr(app, 'current_project')}")
        if hasattr(app, 'current_project'):
            print(f"[LLM DEBUG] current_project is not None: {project is not None}")
        if project:
            print(f"[LLM DEBUG] project.segments exists: {hasattr(project, 'segments')}")

    if not project:
        self.project_status_label.setText("⚠️ No project loaded")
        self.project_status_label.setStyleSheet(warning_style)
        self.create_dataset_button.setEnabled(False)
        return

    # Tolerate a missing, None or empty segments attribute.
    segments = getattr(project, 'segments', None) or []
    total_segs = len(segments)
    print(f"[LLM DEBUG] Project has {total_segs} segments")

    if total_segs == 0:
        self.project_status_label.setText("⚠️ Project has no segments")
        self.project_status_label.setStyleSheet(warning_style)
        self.create_dataset_button.setEnabled(False)
        return

    # A segment counts as translated when its target is non-blank.
    translated = sum(1 for seg in segments if seg.target and seg.target.strip())
    pct = translated / total_segs * 100  # total_segs > 0 is guaranteed here

    status_html = f"""
    <b>Project Status:</b><br>
    • Total segments: {total_segs}<br>
    • Translated: {translated} ({pct:.1f}%)<br>
    """

    if translated == 0:
        status_html += "<br>⚠️ <b>No translations yet</b><br>"
        status_html += "Quality scoring unavailable<br>"
        status_html += "Will compare: Speed, Cost, Outputs"
        self.project_status_label.setStyleSheet("color: #FF6600; font-size: 9pt; padding: 5px; background: #FFF8E1; border-radius: 3px;")
    elif translated < total_segs:
        status_html += f"<br>✓ Quality scoring available for {translated} segments"
        self.project_status_label.setStyleSheet("color: #0066CC; font-size: 9pt; padding: 5px; background: #E3F2FD; border-radius: 3px;")
    else:
        status_html += "<br>✓ Quality scoring available (fully translated)"
        self.project_status_label.setStyleSheet("color: #00AA00; font-size: 9pt; padding: 5px; background: #E8F5E9; border-radius: 3px;")

    self.project_status_label.setText(status_html)
    self.create_dataset_button.setEnabled(True)
624
+
625
def _on_create_project_dataset(self):
    """Sample the current project into a test dataset and select it.

    Warns and returns when no project is loaded. Any exception raised while
    creating or reporting the dataset is shown in a dialog and logged.
    """
    app = self.parent_app
    project_loaded = bool(app) and hasattr(app, 'current_project') and bool(app.current_project)
    if not project_loaded:
        QMessageBox.warning(self, "Error", "No project loaded")
        return

    project = app.current_project
    sample_size = self.sample_size_spin.value()

    # Combo text -> sampling method identifier; unknown text falls back to "smart"
    method_by_label = {
        "Smart Sampling": "smart",
        "Random": "random",
        "Evenly Spaced": "evenly_spaced"
    }
    chosen_label = self.sampling_method_combo.currentText()
    sampling_method = method_by_label.get(chosen_label, "smart")

    try:
        self.project_dataset, self.project_metadata = create_dataset_from_project(
            project,
            sample_size=sample_size,
            sampling_method=sampling_method,
            require_targets=False
        )
        self.current_dataset = self.project_dataset

        info = self.project_metadata
        scoring_on = info['quality_scoring_available']

        self.log(f"Created project dataset: {self.project_dataset.name}")
        self.log(f" • Sampled {info['sampled_segments']} segments from {info['total_segments']} total")
        self.log(f" • Method: {sampling_method}")
        self.log(f" • References available: {info['segments_with_references']}/{info['sampled_segments']}")
        if scoring_on:
            self.log(f" • ✓ Quality scoring enabled")
        else:
            self.log(f" • ⚠️ Quality scoring disabled (no reference translations)")

        # Turn the button into a green confirmation
        self.create_dataset_button.setText(f"✓ Dataset Created ({len(self.project_dataset.segments)} segments)")
        self.create_dataset_button.setStyleSheet("background-color: #4CAF50; color: white; font-weight: bold;")

        QMessageBox.information(
            self,
            "Dataset Created",
            f"Successfully created test dataset with {info['sampled_segments']} segments.\n\n"
            f"Quality scoring: {'Enabled' if info['quality_scoring_available'] else 'Disabled (no references)'}\n"
            f"Ready to benchmark!"
        )
    except Exception as exc:
        QMessageBox.critical(self, "Error", f"Failed to create dataset:\n{str(exc)}")
        self.log(f"ERROR creating project dataset: {str(exc)}")
680
+
681
def _on_run_benchmark(self):
    """Validate the selection, reset the UI and start a benchmark thread.

    Returns early (after a dialog) when there is no client factory, no
    dataset, no selected model, or the user declines to run without chrF
    scoring.
    """
    if not self.llm_client_factory:
        QMessageBox.warning(self, "Error", "LLM client factory not available")
        return

    if not self.current_dataset:
        QMessageBox.warning(self, "Error", "No dataset selected")
        return

    selected_models = self._get_selected_models()
    if not selected_models:
        QMessageBox.warning(self, "Error", "Please select at least one model to test")
        return

    # Without sacrebleu there are no quality scores; let the user decide.
    if not CHRF_AVAILABLE:
        buttons = QMessageBox.StandardButton
        answer = QMessageBox.question(
            self,
            "Quality Scoring Unavailable",
            "sacrebleu library is not installed. Quality scores will not be calculated.\n\n"
            "Continue anyway?",
            buttons.Yes | buttons.No
        )
        if answer == buttons.No:
            return

    # Wipe anything left from the previous run
    for table in (self.results_table, self.summary_table):
        table.setRowCount(0)
    self.log_output.clear()
    self.current_results = []

    # Switch the controls into "running" mode
    self.run_button.setEnabled(False)
    self.cancel_button.setEnabled(True)
    self.export_button.setEnabled(False)
    self.progress_bar.setVisible(True)
    self.progress_bar.setValue(0)
    self.status_label.setText("Running benchmark...")

    self.leaderboard = LLMLeaderboard(self.llm_client_factory, self.log)

    # Run in the background and wire the thread's signals to the UI handlers
    worker = BenchmarkThread(self.leaderboard, self.current_dataset, selected_models)
    self.benchmark_thread = worker
    worker.progress_update.connect(self._on_progress_update)
    worker.finished.connect(self._on_benchmark_finished)
    worker.error.connect(self._on_benchmark_error)
    worker.start()
732
+
733
def _on_cancel_benchmark(self):
    """Ask the active leaderboard to cancel and log that a cancel was requested.

    Does nothing when no leaderboard exists.
    """
    board = self.leaderboard
    if not board:
        return
    board.cancel_benchmark()
    self.log("⚠️ Cancelling benchmark...")
738
+
739
+ def _on_progress_update(self, current: int, total: int, message: str):
740
+ """Update progress bar and status"""
741
+ self.progress_bar.setMaximum(total)
742
+ self.progress_bar.setValue(current)
743
+ self.status_label.setText(f"{message} ({current}/{total})")
744
+
745
+ def _on_benchmark_finished(self):
746
+ """Handle benchmark completion"""
747
+ try:
748
+ # Get results from benchmark thread (stored there, not passed through signal)
749
+ if self.benchmark_thread and hasattr(self.benchmark_thread, 'results'):
750
+ results = self.benchmark_thread.results[:]
751
+ elif self.leaderboard and hasattr(self.leaderboard, 'results'):
752
+ results = self.leaderboard.results[:]
753
+ else:
754
+ results = []
755
+
756
+ print(f"[UI] _on_benchmark_finished called, retrieved {len(results)} results")
757
+
758
+ # Validate results
759
+ if not results:
760
+ print(f"[UI] WARNING: No results to display")
761
+ self.run_button.setEnabled(True)
762
+ self.cancel_button.setEnabled(False)
763
+ self.progress_bar.setVisible(False)
764
+ self.status_label.setText("⚠️ Benchmark complete but no results")
765
+ self.log("⚠️ Benchmark complete but produced no results")
766
+ return
767
+
768
+ # Copy results to avoid threading issues
769
+ self.current_results = results
770
+
771
+ # Update UI state
772
+ print(f"[UI] Updating UI state...")
773
+ self.run_button.setEnabled(True)
774
+ self.cancel_button.setEnabled(False)
775
+ self.export_button.setEnabled(True)
776
+ self.progress_bar.setVisible(False)
777
+ self.status_label.setText(f"✅ Benchmark complete: {len(results)} results")
778
+ print(f"[UI] UI state updated")
779
+
780
+ # Block table signals during population to prevent crashes
781
+ print(f"[UI] Populating results table...")
782
+ self.results_table.blockSignals(True)
783
+ try:
784
+ self._populate_results_table(results)
785
+ print(f"[UI] Results table populated")
786
+ finally:
787
+ self.results_table.blockSignals(False)
788
+
789
+ # Populate summary table
790
+ print(f"[UI] Populating summary table...")
791
+ self.summary_table.blockSignals(True)
792
+ try:
793
+ self._populate_summary_table()
794
+ print(f"[UI] Summary table populated")
795
+ finally:
796
+ self.summary_table.blockSignals(False)
797
+
798
+ self.log("✅ Benchmark finished successfully")
799
+ print(f"[UI] _on_benchmark_finished completed successfully")
800
+ except Exception as e:
801
+ print(f"[UI] ERROR in _on_benchmark_finished: {str(e)}")
802
+ import traceback
803
+ print(f"[UI] TRACEBACK:\n{traceback.format_exc()}")
804
+ self.log(f"❌ Error displaying results: {str(e)}")
805
+ # Re-enable buttons even on error
806
+ self.run_button.setEnabled(True)
807
+ self.cancel_button.setEnabled(False)
808
+ self.progress_bar.setVisible(False)
809
+ self.status_label.setText("❌ Display error")
810
+ QMessageBox.critical(self, "Display Error", f"Benchmark completed but failed to display results:\n\n{str(e)}")
811
+
812
def _on_benchmark_error(self, error_msg: str):
    """Return the UI to idle after a failed run and report the error.

    Args:
        error_msg: Error text shown in the message box and appended to the log.
    """
    # Restore the controls before the modal dialog is shown
    for button, enabled in ((self.run_button, True), (self.cancel_button, False)):
        button.setEnabled(enabled)
    self.progress_bar.setVisible(False)
    self.status_label.setText("❌ Benchmark failed")

    dialog_text = f"An error occurred:\n\n{error_msg}"
    QMessageBox.critical(self, "Benchmark Error", dialog_text)
    self.log(f"❌ Error: {error_msg}")
821
+
822
+ def _populate_results_table(self, results: List[BenchmarkResult]):
823
+ """Populate results table with benchmark data"""
824
+ try:
825
+ # Validate inputs
826
+ if not results:
827
+ print("[UI] _populate_results_table: No results to populate")
828
+ return
829
+
830
+ if not self.current_dataset or not hasattr(self.current_dataset, 'segments'):
831
+ print("[UI] _populate_results_table: No current dataset")
832
+ return
833
+
834
+ # Clear existing rows
835
+ self.results_table.setRowCount(0)
836
+
837
+ # Group results by segment
838
+ segments_dict = {}
839
+ for result in results:
840
+ if not result or not hasattr(result, 'segment_id'):
841
+ continue
842
+ if result.segment_id not in segments_dict:
843
+ segments_dict[result.segment_id] = []
844
+ segments_dict[result.segment_id].append(result)
845
+
846
+ # Populate table
847
+ row = 0
848
+ for segment_id in sorted(segments_dict.keys()):
849
+ segment_results = segments_dict[segment_id]
850
+
851
+ # Get source text from dataset
852
+ source_text = "(source not found)"
853
+ if self.current_dataset and self.current_dataset.segments:
854
+ for seg in self.current_dataset.segments:
855
+ if hasattr(seg, 'id') and seg.id == segment_id:
856
+ source_text = seg.source if hasattr(seg, 'source') else "(no source)"
857
+ break
858
+
859
+ # Truncate source text for display
860
+ if source_text and len(source_text) > 80:
861
+ source_text = source_text[:77] + "..."
862
+
863
+ for result in segment_results:
864
+ try:
865
+ self.results_table.insertRow(row)
866
+
867
+ # Segment ID
868
+ self.results_table.setItem(row, 0, QTableWidgetItem(str(segment_id)))
869
+
870
+ # Source text
871
+ self.results_table.setItem(row, 1, QTableWidgetItem(source_text))
872
+
873
+ # Model name
874
+ model_name = result.model_name if hasattr(result, 'model_name') else "Unknown"
875
+ self.results_table.setItem(row, 2, QTableWidgetItem(model_name))
876
+
877
+ # Translation output
878
+ output_text = result.output if (hasattr(result, 'output') and result.output) else f"ERROR: {getattr(result, 'error', 'Unknown error')}"
879
+ if len(output_text) > 100:
880
+ output_text = output_text[:97] + "..."
881
+ item = QTableWidgetItem(output_text)
882
+ if hasattr(result, 'error') and result.error:
883
+ item.setForeground(QColor("red"))
884
+ self.results_table.setItem(row, 3, item)
885
+
886
+ # Speed
887
+ latency = result.latency_ms if hasattr(result, 'latency_ms') else 0.0
888
+ speed_item = QTableWidgetItem(f"{latency:.0f}")
889
+ self.results_table.setItem(row, 4, speed_item)
890
+
891
+ # Quality
892
+ if hasattr(result, 'quality_score') and result.quality_score is not None:
893
+ quality_item = QTableWidgetItem(f"{result.quality_score:.1f}")
894
+ self.results_table.setItem(row, 5, quality_item)
895
+ else:
896
+ self.results_table.setItem(row, 5, QTableWidgetItem("—"))
897
+
898
+ row += 1
899
+ except Exception as row_error:
900
+ print(f"[UI] Error populating row {row}: {row_error}")
901
+ continue
902
+
903
+ except Exception as e:
904
+ print(f"[UI] ERROR in _populate_results_table: {str(e)}")
905
+ import traceback
906
+ print(f"[UI] TRACEBACK:\n{traceback.format_exc()}")
907
+ raise
908
+
909
+ def _populate_summary_table(self):
910
+ """Populate summary statistics table"""
911
+ try:
912
+ if not self.leaderboard:
913
+ print("[UI] _populate_summary_table: No leaderboard instance")
914
+ return
915
+
916
+ summary = self.leaderboard.get_summary_stats()
917
+
918
+ if not summary:
919
+ print("[UI] _populate_summary_table: No summary stats available")
920
+ return
921
+
922
+ self.summary_table.setRowCount(len(summary))
923
+ row = 0
924
+
925
+ for model_name, stats in summary.items():
926
+ try:
927
+ # Model name
928
+ self.summary_table.setItem(row, 0, QTableWidgetItem(str(model_name)))
929
+
930
+ # Avg speed
931
+ avg_speed = stats.get("avg_latency_ms", 0.0)
932
+ speed_item = QTableWidgetItem(f"{avg_speed:.0f}")
933
+ self.summary_table.setItem(row, 1, speed_item)
934
+
935
+ # Avg quality
936
+ avg_quality = stats.get("avg_quality_score")
937
+ if avg_quality is not None:
938
+ quality_item = QTableWidgetItem(f"{avg_quality:.1f}")
939
+ self.summary_table.setItem(row, 2, quality_item)
940
+ else:
941
+ self.summary_table.setItem(row, 2, QTableWidgetItem("—"))
942
+
943
+ # Success count
944
+ success_count = stats.get("success_count", 0)
945
+ self.summary_table.setItem(row, 3, QTableWidgetItem(str(success_count)))
946
+
947
+ # Error count
948
+ error_count = stats.get("error_count", 0)
949
+ error_item = QTableWidgetItem(str(error_count))
950
+ if error_count > 0:
951
+ error_item.setForeground(QColor("red"))
952
+ self.summary_table.setItem(row, 4, error_item)
953
+
954
+ row += 1
955
+ except Exception as row_error:
956
+ print(f"[UI] Error populating summary row {row}: {row_error}")
957
+ continue
958
+
959
+ except Exception as e:
960
+ print(f"[UI] ERROR in _populate_summary_table: {str(e)}")
961
+ import traceback
962
+ print(f"[UI] TRACEBACK:\n{traceback.format_exc()}")
963
+ raise
964
+
965
def _on_export_results(self):
    """Ask for a destination and export the current results as Excel or JSON.

    The suggested filename is built from the dataset name (made safe for
    the Windows filesystem) and a timestamp. The format follows the chosen
    file extension (``.json`` or ``.xlsx``, case-insensitive); when the
    path has neither, the selected dialog filter decides. Failures are
    shown in a message box and logged, not raised.
    """
    if not self.current_results:
        QMessageBox.warning(self, "No Results", "No benchmark results to export")
        return

    from datetime import datetime

    # Generate filename with dataset info - sanitize for Windows filesystem.
    # Spaces become "_", the arrow becomes "-", and < > : " / \ | ? * become "_".
    replacements = {" ": "_", "→": "-"}
    replacements.update(dict.fromkeys('<>:"/\\|?*', "_"))
    raw_name = self.current_dataset.name if self.current_dataset else "dataset"
    dataset_name = raw_name.translate(str.maketrans(replacements))
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    default_filename = f"LLM_Leaderboard_{dataset_name}_{timestamp}.xlsx"

    # Ask user for file path and format
    filepath, selected_filter = QFileDialog.getSaveFileName(
        self,
        "Export Benchmark Results",
        default_filename,
        "Excel Files (*.xlsx);;JSON Files (*.json);;All Files (*)"
    )

    if not filepath:
        return

    # The extension the user typed wins over the filter: with the default
    # Excel filter still selected, "results.json" must be written as JSON.
    lowered_path = filepath.lower()
    if lowered_path.endswith('.json'):
        use_excel = False
    elif lowered_path.endswith('.xlsx'):
        use_excel = True
    else:
        use_excel = "Excel" in selected_filter

    try:
        if use_excel:
            self._export_to_excel(filepath)
        else:
            self._export_to_json(filepath)

        QMessageBox.information(self, "Export Complete", f"Results exported to:\n{filepath}")
        self.log(f"OK Results exported to {filepath}")

    except Exception as e:
        QMessageBox.critical(self, "Export Error", f"Failed to export results:\n{str(e)}")
        self.log(f"ERROR Export error: {e}")
1005
+
1006
+ def _export_to_json(self, filepath: str):
1007
+ """Export results to JSON file"""
1008
+ export_data = self.leaderboard.export_to_dict()
1009
+ with open(filepath, 'w', encoding='utf-8') as f:
1010
+ json.dump(export_data, f, indent=2, ensure_ascii=False)
1011
+
1012
def _export_to_excel(self, filepath: str):
    """Write the benchmark results to an Excel workbook.

    The workbook has three sheets: "About" (dataset information and a
    legend), "Summary" (per-model aggregates with the best average speed
    and quality highlighted) and "Results" (for each segment: source,
    reference when present, and every model's output or error).

    Args:
        filepath: Destination path passed to ``Workbook.save``.

    Raises:
        ImportError: If openpyxl cannot be imported.
        Exception: Errors from openpyxl or from reading result/dataset
            attributes propagate to the caller.
    """
    from openpyxl import Workbook
    from openpyxl.styles import Font, PatternFill, Alignment
    from datetime import datetime

    results = self.current_results
    dataset = self.current_dataset

    # Models in first-seen order; unlike a set, this keeps the export deterministic
    all_models = list(dict.fromkeys(r.model_name for r in results))

    # Shared styling
    section_font = Font(bold=True, size=12, color="366092")
    header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
    header_font = Font(color="FFFFFF", bold=True)
    best_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
    error_fill = PatternFill(start_color="FFCCCC", end_color="FFCCCC", fill_type="solid")
    wrap_top = Alignment(wrap_text=True, vertical="top")

    wb = Workbook()

    # === TITLE/INFO SHEET ===
    ws_info = wb.active
    ws_info.title = "About"

    # Title with emoji (matches UI header style)
    ws_info['A1'] = "📊 Superbench"
    ws_info['A1'].font = Font(size=24, bold=True, color="1976D2")  # Blue color matching UI
    ws_info.merge_cells('A1:D1')

    # Subtitle (matches UI description style)
    ws_info['A2'] = "Translation Quality Benchmarking System"
    ws_info['A2'].font = Font(size=12, italic=True, color="666666")
    ws_info.merge_cells('A2:D2')

    # Supervertaler module branding
    ws_info['A3'] = "A Supervertaler Module"
    ws_info['A3'].font = Font(size=11, color="0066CC", underline="single")
    ws_info['A3'].hyperlink = "https://supervertaler.com/"
    ws_info.merge_cells('A3:D3')

    # Spacing
    ws_info.row_dimensions[4].height = 20

    info_label_font = Font(size=10, color="666666")
    info_value_font = Font(size=10)

    row = 5
    ws_info[f'A{row}'] = "BENCHMARK INFORMATION"
    ws_info[f'A{row}'].font = section_font
    row += 1

    info_rows = [
        ("Test Dataset:", dataset.name),
        ("Description:", dataset.description),
        ("Segments Tested:", len(dataset.segments)),
        ("Date & Time:", datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
        ("Models Tested:", ", ".join(all_models)),
    ]
    for label, value in info_rows:
        ws_info[f'A{row}'] = label
        ws_info[f'A{row}'].font = info_label_font
        ws_info[f'B{row}'] = value
        ws_info[f'B{row}'].font = info_value_font
        row += 1
    row += 1  # blank line before the next section

    # Explanation section
    ws_info[f'A{row}'] = "UNDERSTANDING THE RESULTS"
    ws_info[f'A{row}'].font = section_font
    row += 1

    explanations = [
        ("Quality Score (chrF++):", "Character-level metric measuring translation accuracy. Higher is better (0-100). Scores above 80 indicate excellent quality."),
        ("Speed (ms):", "Translation time in milliseconds. Lower is better. Typical range: 1000-5000ms per segment."),
        ("Success Count:", "Number of segments successfully translated without errors."),
        ("Error Count:", "Number of failed translations. Should be 0 for production use."),
        ("Color Coding:", "Green highlighting indicates the best performer in each category (quality/speed).")
    ]

    for label, explanation in explanations:
        ws_info[f'A{row}'] = label
        ws_info[f'A{row}'].font = Font(bold=True, size=10)
        ws_info[f'B{row}'] = explanation
        ws_info[f'B{row}'].font = Font(size=10)
        ws_info[f'B{row}'].alignment = Alignment(wrap_text=True)
        row += 1

    row += 1
    ws_info[f'A{row}'] = "NAVIGATION"
    ws_info[f'A{row}'].font = section_font
    row += 1

    navigation_items = [
        ("Summary Tab:", "View aggregated statistics and compare models side-by-side."),
        ("Results Tab:", "View detailed translation output for each segment and model."),
    ]

    for label, description in navigation_items:
        ws_info[f'A{row}'] = label
        ws_info[f'A{row}'].font = Font(bold=True, size=10)
        ws_info[f'B{row}'] = description
        ws_info[f'B{row}'].font = Font(size=10)
        row += 1

    # Column widths
    for column, width in (('A', 20), ('B', 80), ('C', 15), ('D', 15)):
        ws_info.column_dimensions[column].width = width

    # === SUMMARY SHEET ===
    ws_summary = wb.create_sheet("Summary")

    summary_headers = ["Model", "Provider", "Model ID", "Avg Speed (ms)", "Avg Quality (chrF++)",
                       "Success Count", "Error Count", "Total Tests"]
    ws_summary.append(summary_headers)

    for col_num in range(1, len(summary_headers) + 1):
        cell = ws_summary.cell(1, col_num)
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = Alignment(horizontal="center", vertical="center")

    summary_stats = self.leaderboard.get_summary_stats()

    # Best values are the same for every row, so compute them once up front
    quality_values = [s['avg_quality_score'] for s in summary_stats.values() if s['avg_quality_score']]
    speed_values = [s['avg_latency_ms'] for s in summary_stats.values() if s['avg_latency_ms']]
    best_avg_quality = max(quality_values) if quality_values else None
    best_avg_speed = min(speed_values) if speed_values else None

    # Provider / model ID of the first result seen for each model
    model_meta = {}
    for result in results:
        if result.model_name not in model_meta:
            model_meta[result.model_name] = (result.provider, result.model_id)

    row_num = 2
    for model_name, stats in summary_stats.items():
        provider, model_id = model_meta.get(model_name, ("", ""))
        avg_latency = stats['avg_latency_ms']
        avg_quality = stats['avg_quality_score']
        total_tests = stats["success_count"] + stats["error_count"]

        ws_summary.cell(row_num, 1, model_name)
        ws_summary.cell(row_num, 2, provider)
        ws_summary.cell(row_num, 3, model_id)

        # Averages are stored as numbers with a display format, not as text,
        # so the columns sort and compare numerically in Excel.
        speed_cell = ws_summary.cell(row_num, 4, avg_latency if avg_latency else "")
        quality_cell = ws_summary.cell(row_num, 5, avg_quality if avg_quality else "")
        if avg_latency:
            speed_cell.number_format = '0'
        if avg_quality:
            quality_cell.number_format = '0.00'

        ws_summary.cell(row_num, 6, stats["success_count"])
        ws_summary.cell(row_num, 7, stats["error_count"])
        ws_summary.cell(row_num, 8, total_tests)

        # Highlight best quality score
        if avg_quality and avg_quality == best_avg_quality:
            quality_cell.fill = best_fill
            quality_cell.font = Font(bold=True)

        # Highlight best speed
        if avg_latency and avg_latency == best_avg_speed:
            speed_cell.fill = best_fill
            speed_cell.font = Font(bold=True)

        row_num += 1

    summary_widths = (('A', 25), ('B', 15), ('C', 30), ('D', 18),
                      ('E', 20), ('F', 15), ('G', 15), ('H', 15))
    for column, width in summary_widths:
        ws_summary.column_dimensions[column].width = width

    # === RESULTS SHEET (Segment-Grouped Format) ===
    ws_results = wb.create_sheet("Results")

    # Group results by segment
    segments_dict = {}
    for result in results:
        segments_dict.setdefault(result.segment_id, []).append(result)

    # First dataset segment for each id, indexed once instead of per segment
    segment_lookup = {}
    for seg in dataset.segments:
        if seg.id not in segment_lookup:
            segment_lookup[seg.id] = seg

    segment_header_fill = PatternFill(start_color="E3F2FD", end_color="E3F2FD", fill_type="solid")
    segment_header_font = Font(bold=True, size=11)
    label_font = Font(bold=True, size=10)

    # Fixed background colors for specific models
    model_colors = {
        "GPT-4o": PatternFill(start_color="FFCCCC", end_color="FFCCCC", fill_type="solid"),  # Pink/salmon
        "Claude Sonnet 4.5": PatternFill(start_color="CCFFCC", end_color="CCFFCC", fill_type="solid"),  # Light green
        "Gemini 2.5 Flash": PatternFill(start_color="CCDDFF", end_color="CCDDFF", fill_type="solid"),  # Light blue
    }
    # Fallback colors for any other model
    default_model_colors = [
        PatternFill(start_color="FFF4E6", end_color="FFF4E6", fill_type="solid"),  # Light orange
        PatternFill(start_color="F3E5F5", end_color="F3E5F5", fill_type="solid"),  # Light purple
        PatternFill(start_color="E8F5E9", end_color="E8F5E9", fill_type="solid"),  # Light green-2
    ]
    # Pick each model's fill once from its first-seen position, so a model keeps
    # the same color in every segment even if a segment lacks some models.
    model_fills = {
        name: model_colors.get(name, default_model_colors[position % len(default_model_colors)])
        for position, name in enumerate(all_models)
    }

    row_num = 1
    for segment_id in sorted(segments_dict):
        segment_results = segments_dict[segment_id]

        # Source and reference text from the dataset (empty when not found)
        source_text = ""
        reference_text = ""
        matching_segment = segment_lookup.get(segment_id)
        if matching_segment is not None:
            source_text = matching_segment.source
            reference_text = matching_segment.reference

        # Segment header row (spans columns A-B)
        ws_results.merge_cells(f'A{row_num}:B{row_num}')
        segment_header_cell = ws_results.cell(row_num, 1, f"Segment {segment_id}")
        segment_header_cell.fill = segment_header_fill
        segment_header_cell.font = segment_header_font
        segment_header_cell.alignment = Alignment(horizontal="left", vertical="center")
        row_num += 1

        # Source row
        ws_results.cell(row_num, 1, "Source:").font = label_font
        ws_results.cell(row_num, 2, source_text).alignment = wrap_top
        row_num += 1

        # Reference row and best score (only when a reference exists)
        best_quality = None
        if reference_text:
            ws_results.cell(row_num, 1, "Reference:").font = label_font
            ws_results.cell(row_num, 2, reference_text).alignment = wrap_top
            row_num += 1

            quality_scores = [r.quality_score for r in segment_results if r.quality_score is not None]
            if quality_scores:
                best_quality = max(quality_scores)

        # Model output rows
        for result in segment_results:
            model_cell = ws_results.cell(row_num, 1, result.model_name)
            model_cell.font = label_font

            output_cell = ws_results.cell(row_num, 2, result.output or result.error or "")
            output_cell.alignment = wrap_top

            # Precedence: model color, then best-quality green, then error red
            row_fill = model_fills[result.model_name]
            if best_quality and result.quality_score == best_quality:
                row_fill = best_fill
            if result.error:
                row_fill = error_fill
            model_cell.fill = row_fill
            output_cell.fill = row_fill

            row_num += 1

        # Blank row between segments
        row_num += 1

    ws_results.column_dimensions['A'].width = 20  # Model name column
    ws_results.column_dimensions['B'].width = 80  # Text column (wider for readability)

    wb.save(filepath)
1326
+
1327
+ def _get_selected_models(self) -> List[ModelConfig]:
1328
+ """Get list of selected models from UI"""
1329
+ models = []
1330
+
1331
+ # Map model IDs to friendly display names
1332
+ model_names = {
1333
+ # OpenAI
1334
+ "gpt-4o": "GPT-4o",
1335
+ "gpt-4o-mini": "GPT-4o Mini",
1336
+ "gpt-5": "GPT-5 (Reasoning)",
1337
+
1338
+ # Claude
1339
+ "claude-sonnet-4-5-20250929": "Claude Sonnet 4.5",
1340
+ "claude-haiku-4-5-20251001": "Claude Haiku 4.5",
1341
+ "claude-opus-4-1-20250805": "Claude Opus 4.1",
1342
+
1343
+ # Gemini
1344
+ "gemini-2.5-flash": "Gemini 2.5 Flash",
1345
+ "gemini-2.5-flash-lite": "Gemini 2.5 Flash Lite",
1346
+ "gemini-2.5-pro": "Gemini 2.5 Pro",
1347
+ "gemini-2.0-flash-exp": "Gemini 2.0 Flash (Exp)"
1348
+ }
1349
+
1350
+ if self.openai_checkbox.isChecked():
1351
+ model_id = self.openai_model_combo.currentText()
1352
+ models.append(ModelConfig(
1353
+ name=model_names.get(model_id, model_id),
1354
+ provider="openai",
1355
+ model_id=model_id
1356
+ ))
1357
+
1358
+ if self.claude_checkbox.isChecked():
1359
+ model_id = self.claude_model_combo.currentText()
1360
+ models.append(ModelConfig(
1361
+ name=model_names.get(model_id, model_id),
1362
+ provider="claude",
1363
+ model_id=model_id
1364
+ ))
1365
+
1366
+ if self.gemini_checkbox.isChecked():
1367
+ model_id = self.gemini_model_combo.currentText()
1368
+ models.append(ModelConfig(
1369
+ name=model_names.get(model_id, model_id),
1370
+ provider="gemini",
1371
+ model_id=model_id
1372
+ ))
1373
+
1374
+ return models
1375
+
1376
def log(self, message: str):
    """Append *message* to the log view and scroll it to the bottom.

    Args:
        message: Text appended to ``self.log_output``.
    """
    output = self.log_output
    output.append(message)

    # Move the vertical scroll bar to its maximum so the newest entry is visible
    bar = output.verticalScrollBar()
    bar.setValue(bar.maximum())
1382
+
1383
+
1384
+ # For standalone testing
1385
if __name__ == "__main__":
    import sys
    from PyQt6.QtWidgets import QApplication

    def mock_llm_factory(provider, model):
        """Stand-in factory for trying out the UI: prints the request, returns no client."""
        print(f"Mock: Creating {provider} client with model {model}")
        return None

    # The application object is created before any widget
    app = QApplication(sys.argv)

    window = LLMLeaderboardUI(llm_client_factory=mock_llm_factory)
    window.resize(1200, 800)
    window.setWindowTitle("Superbench")
    window.show()

    exit_code = app.exec()
    sys.exit(exit_code)