supervertaler-1.9.163-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- Supervertaler.py +48473 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1911 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +351 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1176 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.163.dist-info/METADATA +906 -0
- supervertaler-1.9.163.dist-info/RECORD +85 -0
- supervertaler-1.9.163.dist-info/WHEEL +5 -0
- supervertaler-1.9.163.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.163.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.163.dist-info/top_level.txt +2 -0
modules/llm_leaderboard.py
@@ -0,0 +1,737 @@
"""
LLM Leaderboard - Core Benchmarking Module
===========================================

Comprehensive LLM translation benchmarking system for Supervertaler.
Compare translation quality, speed, and cost across multiple providers.

Features:
- Multi-provider comparison (OpenAI, Claude, Gemini)
- Quality scoring (chrF++ metric)
- Speed measurement (latency per segment)
- Cost estimation (token-based)
- Test dataset management
- Results export (Excel/CSV)

Author: Michael Beijer
License: MIT
"""

import time
import json
import random
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict
import threading

try:
    from sacrebleu.metrics import CHRF
    CHRF_AVAILABLE = True
except ImportError:
    CHRF_AVAILABLE = False
    print("Warning: sacrebleu not installed. Quality scoring will be disabled.")
    print("Install with: pip install sacrebleu")

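# --- Illustrative sketch (not in the packaged module): chrF++ scoring call pattern ---
# The module scores quality with sacrebleu's CHRF(word_order=2) ("chrF++"), comparing one
# hypothesis against one reference via corpus_score(hypotheses, [references]) and reading
# the 0-100 value from .score. The sentences below are arbitrary stand-ins.
def _demo_chrf_scoring():
    if not CHRF_AVAILABLE:
        return None
    metric = CHRF(word_order=2)
    hypothesis = "De kat zit op de mat."
    reference = "De kat zat op de mat."
    result = metric.corpus_score([hypothesis], [[reference]])
    return result.score  # identical strings would score 100.0
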
@dataclass
class TestSegment:
    """Single test segment with source and reference translation"""
    id: int
    source: str
    reference: str
    domain: str = "general"
    direction: str = "EN→NL"
    context: str = ""


@dataclass
class BenchmarkResult:
    """Result of translating a single segment with one model"""
    segment_id: int
    model_name: str
    provider: str
    model_id: str
    output: str
    latency_ms: float
    quality_score: Optional[float] = None
    error: Optional[str] = None
    tokens_input: Optional[int] = None
    tokens_output: Optional[int] = None
    cost_estimate: Optional[float] = None


@dataclass
class ModelConfig:
    """Configuration for a single model to test"""
    name: str        # Display name (e.g., "GPT-4o")
    provider: str    # "openai", "claude", "gemini"
    model_id: str    # Actual model ID for API
    enabled: bool = True

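# --- Illustrative sketch (not in the packaged module): declaring a model line-up ---
# A benchmark run takes a list of ModelConfig entries; disabled entries are filtered out by
# run_benchmark(). The model_id values below are placeholders, not verified API identifiers.
EXAMPLE_MODELS = [
    ModelConfig(name="GPT-4o", provider="openai", model_id="gpt-4o-placeholder"),
    ModelConfig(name="Claude Sonnet", provider="claude", model_id="claude-sonnet-placeholder"),
    ModelConfig(name="Gemini Flash", provider="gemini", model_id="gemini-flash-placeholder", enabled=False),
]
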
class TestDataset:
    """Manages test datasets for benchmarking"""

    def __init__(self, name: str, description: str = ""):
        self.name = name
        self.description = description
        self.segments: List[TestSegment] = []

    def add_segment(self, segment: TestSegment):
        """Add a test segment to the dataset"""
        self.segments.append(segment)

    def to_dict(self) -> Dict:
        """Convert dataset to dictionary for JSON export"""
        return {
            "name": self.name,
            "description": self.description,
            "segments": [asdict(seg) for seg in self.segments]
        }

    @classmethod
    def from_dict(cls, data: Dict) -> 'TestDataset':
        """Load dataset from dictionary"""
        dataset = cls(data["name"], data.get("description", ""))
        for seg_data in data.get("segments", []):
            dataset.add_segment(TestSegment(**seg_data))
        return dataset

    @classmethod
    def from_json_file(cls, filepath: Path) -> 'TestDataset':
        """Load dataset from JSON file"""
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls.from_dict(data)

    def save_to_json(self, filepath: Path):
        """Save dataset to JSON file"""
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)

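# --- Illustrative sketch (not in the packaged module): TestDataset JSON round-trip ---
# Builds a tiny dataset with the methods defined above and round-trips it through
# save_to_json()/from_json_file(). The file name and segment texts are arbitrary.
def _demo_dataset_roundtrip():
    demo = TestDataset("Demo EN→NL", "Two throwaway segments")
    demo.add_segment(TestSegment(1, "Hello world.", "Hallo wereld.", "general", "EN→NL"))
    demo.add_segment(TestSegment(2, "Good morning.", "Goedemorgen.", "general", "EN→NL"))

    path = Path("demo_dataset.json")
    demo.save_to_json(path)
    loaded = TestDataset.from_json_file(path)
    assert len(loaded.segments) == 2
    assert loaded.segments[0].reference == "Hallo wereld."
    return loaded
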
class LLMLeaderboard:
    """Main benchmarking engine for LLM translation comparison"""

    def __init__(self, llm_client_factory, log_callback=None):
        """
        Initialize LLM Leaderboard

        Args:
            llm_client_factory: Function that creates LLMClient instances
                Signature: (provider: str, model: str) -> LLMClient
            log_callback: Optional callback for logging messages
        """
        self.llm_client_factory = llm_client_factory
        self.log = log_callback if log_callback else print
        self.chrf_metric = CHRF(word_order=2) if CHRF_AVAILABLE else None
        self.results: List[BenchmarkResult] = []
        self.is_running = False
        self.cancel_requested = False

    def _lang_code_to_name(self, code: str) -> str:
        """Convert language code to full language name for LLM prompts"""
        # Common language codes to names mapping
        lang_map = {
            "en": "English", "en-us": "English", "en-gb": "English",
            "nl": "Dutch", "nl-nl": "Dutch", "nl-be": "Dutch (Belgian)",
            "de": "German", "de-de": "German", "de-at": "German (Austrian)",
            "fr": "French", "fr-fr": "French", "fr-be": "French (Belgian)",
            "es": "Spanish", "es-es": "Spanish", "es-mx": "Spanish (Mexican)",
            "it": "Italian", "it-it": "Italian",
            "pt": "Portuguese", "pt-pt": "Portuguese", "pt-br": "Portuguese (Brazilian)",
            "ru": "Russian", "ru-ru": "Russian",
            "zh": "Chinese", "zh-cn": "Chinese (Simplified)", "zh-tw": "Chinese (Traditional)",
            "ja": "Japanese", "ja-jp": "Japanese",
            "ko": "Korean", "ko-kr": "Korean",
            "ar": "Arabic", "ar-sa": "Arabic",
            "pl": "Polish", "pl-pl": "Polish",
            "sv": "Swedish", "sv-se": "Swedish",
            "da": "Danish", "da-dk": "Danish",
            "no": "Norwegian", "nb-no": "Norwegian",
            "fi": "Finnish", "fi-fi": "Finnish",
            "cs": "Czech", "cs-cz": "Czech",
            "tr": "Turkish", "tr-tr": "Turkish",
            "el": "Greek", "el-gr": "Greek",
            "he": "Hebrew", "he-il": "Hebrew",
            "hi": "Hindi", "hi-in": "Hindi",
            "th": "Thai", "th-th": "Thai",
            "vi": "Vietnamese", "vi-vn": "Vietnamese",
            "id": "Indonesian", "id-id": "Indonesian",
            "ms": "Malay", "ms-my": "Malay",
            "uk": "Ukrainian", "uk-ua": "Ukrainian",
            "ro": "Romanian", "ro-ro": "Romanian",
            "hu": "Hungarian", "hu-hu": "Hungarian",
            "bg": "Bulgarian", "bg-bg": "Bulgarian",
            "hr": "Croatian", "hr-hr": "Croatian",
            "sr": "Serbian", "sr-rs": "Serbian",
            "sk": "Slovak", "sk-sk": "Slovak",
            "sl": "Slovenian", "sl-si": "Slovenian",
            "lt": "Lithuanian", "lt-lt": "Lithuanian",
            "lv": "Latvian", "lv-lv": "Latvian",
            "et": "Estonian", "et-ee": "Estonian",
        }

        # Normalize code to lowercase
        code_lower = code.lower().strip()

        # Return mapped name or capitalize the code as fallback
        return lang_map.get(code_lower, code.upper())

    def build_translation_prompt(self, segment: TestSegment) -> str:
        """Build translation prompt for a test segment"""
        # Extract language codes from direction (e.g., "EN→NL" or "en→nl")
        parts = segment.direction.split("→")
        source_code = parts[0].strip() if len(parts) > 0 else "source language"
        target_code = parts[1].strip() if len(parts) > 1 else "target language"

        # Convert codes to full language names
        source_lang = self._lang_code_to_name(source_code)
        target_lang = self._lang_code_to_name(target_code)
        direction_hint = f"from {source_lang} to {target_lang}"

        prompt = f"""You are a professional translator. Translate the following text {direction_hint}.

Domain: {segment.domain}
Target language: {target_lang}
Requirements: Be faithful to meaning, natural, and correct. Preserve units, numbers, and formatting. Keep terminology consistent.

Text to translate:
{segment.source}

Return ONLY the translation, no explanations or additional text."""

        if segment.context:
            prompt = f"Context: {segment.context}\n\n{prompt}"

        return prompt

    def run_benchmark(
        self,
        dataset: TestDataset,
        models: List[ModelConfig],
        progress_callback=None
    ) -> List[BenchmarkResult]:
        """
        Run benchmark comparing multiple models on a test dataset

        Args:
            dataset: TestDataset to run
            models: List of ModelConfig to test
            progress_callback: Optional callback(current, total, message)

        Returns:
            List of BenchmarkResult objects
        """
        self.is_running = True
        self.cancel_requested = False
        self.results = []

        enabled_models = [m for m in models if m.enabled]
        total_tests = len(dataset.segments) * len(enabled_models)
        current_test = 0

        self.log(f"Starting benchmark: {dataset.name}")
        self.log(f" Models: {', '.join(m.name for m in enabled_models)}")
        self.log(f" Segments: {len(dataset.segments)}")
        self.log(f" Total translations: {total_tests}")

        for segment in dataset.segments:
            if self.cancel_requested:
                self.log("Warning: Benchmark cancelled by user")
                break

            prompt = self.build_translation_prompt(segment)

            for model_config in enabled_models:
                if self.cancel_requested:
                    break

                current_test += 1

                # Progress update
                if progress_callback:
                    progress_callback(
                        current_test,
                        total_tests,
                        f"Testing {model_config.name} on segment {segment.id}"
                    )

                # Run translation and measure time
                result = self._translate_segment(
                    segment,
                    model_config,
                    prompt
                )

                self.results.append(result)

                # Log result
                if result.error:
                    self.log(f" ERROR {model_config.name} seg {segment.id}: {result.error}")
                else:
                    quality_str = f", chrF++: {result.quality_score:.1f}" if result.quality_score else ""
                    self.log(f" OK {model_config.name} seg {segment.id}: {result.latency_ms:.0f}ms{quality_str}")

        self.is_running = False
        self.log(f"Benchmark complete: {len(self.results)} results")

        return self.results

    def _translate_segment(
        self,
        segment: TestSegment,
        model_config: ModelConfig,
        prompt: str
    ) -> BenchmarkResult:
        """Translate a single segment with one model and measure performance"""

        result = BenchmarkResult(
            segment_id=segment.id,
            model_name=model_config.name,
            provider=model_config.provider,
            model_id=model_config.model_id,
            output="",
            latency_ms=0.0
        )

        try:
            self.log(f" DEBUG: Starting translation for segment {segment.id} with {model_config.name}")

            # Validate segment data first
            if not segment:
                result.error = "Null segment"
                self.log(f" ERROR: Null segment received")
                return result

            if not hasattr(segment, 'id') or segment.id is None:
                result.error = "Segment missing ID"
                self.log(f" ERROR: Segment missing ID")
                return result

            if not hasattr(segment, 'source'):
                result.error = f"Segment {segment.id} missing source attribute"
                self.log(f" ERROR: {result.error}")
                return result

            # Validate segment source text
            if not segment.source or not segment.source.strip():
                result.error = "Empty source text"
                self.log(f" ERROR: Segment {segment.id} has empty source text")
                return result

            # Parse language codes from direction
            if not hasattr(segment, 'direction') or not segment.direction:
                result.error = f"Segment {segment.id} missing direction"
                self.log(f" ERROR: {result.error}")
                return result

            try:
                direction_parts = segment.direction.split("→")
                if len(direction_parts) != 2:
                    raise ValueError(f"Invalid direction format: {segment.direction}")
                source_lang = direction_parts[0].strip().lower()
                target_lang = direction_parts[1].strip().lower()

                if not source_lang or not target_lang:
                    raise ValueError(f"Empty language codes in direction: {segment.direction}")
            except Exception as lang_err:
                result.error = f"Invalid language direction: {segment.direction} - {str(lang_err)}"
                self.log(f" ERROR: {result.error}")
                return result

            # Create LLM client with error handling
            try:
                self.log(f" DEBUG: Creating client for {model_config.provider} with model {model_config.model_id}")
                client = self.llm_client_factory(model_config.provider, model_config.model_id)
                if not client:
                    result.error = f"Failed to create {model_config.provider} client"
                    self.log(f" ERROR: {result.error}")
                    return result
                self.log(f" DEBUG: Client created successfully")
            except Exception as client_err:
                result.error = f"Client creation failed: {str(client_err)}"
                self.log(f" ERROR: {result.error}")
                return result

            # Measure translation time with comprehensive error handling
            try:
                start_time = time.perf_counter()
                source_preview = segment.source[:50] + "..." if len(segment.source) > 50 else segment.source
                self.log(f" DEBUG: Calling translate with text='{source_preview}', source={source_lang}, target={target_lang}")

                output = client.translate(
                    text=segment.source,
                    source_lang=source_lang,
                    target_lang=target_lang,
                    custom_prompt=prompt
                )
                elapsed_time = time.perf_counter() - start_time

                # Validate output
                if output is None:
                    result.error = "Translation returned None"
                    self.log(f" ERROR: {result.error}")
                    return result

                output_preview = output[:50] if output else 'EMPTY'
                self.log(f" DEBUG: Translation received: '{output_preview}...'")

                result.output = output if isinstance(output, str) else str(output)
                result.latency_ms = elapsed_time * 1000

            except Exception as translate_err:
                result.error = f"Translation failed: {str(translate_err)}"
                self.log(f" ERROR: {result.error}")
                import traceback
                self.log(f" TRACEBACK: {traceback.format_exc()}")
                return result

            # Calculate quality score if reference is available
            if self.chrf_metric and segment.reference and segment.reference.strip():
                try:
                    score = self.chrf_metric.corpus_score([output], [[segment.reference]])
                    result.quality_score = score.score
                    self.log(f" DEBUG: chrF++ score calculated: {result.quality_score:.1f}")
                except Exception as score_err:
                    self.log(f" WARNING: chrF++ scoring failed for segment {segment.id}: {score_err}")
                    result.quality_score = None

            # TODO: Token counting and cost estimation
            # Would need to access response metadata from LLM client

        except Exception as e:
            import traceback
            result.error = str(e)
            error_details = traceback.format_exc()
            self.log(f" ERROR: Exception translating segment {segment.id} with {model_config.name}")
            self.log(f" ERROR: {str(e)}")
            self.log(f" ERROR: Traceback:\n{error_details}")

        return result

    def cancel_benchmark(self):
        """Request cancellation of running benchmark"""
        self.cancel_requested = True

    def get_summary_stats(self) -> Dict:
        """
        Calculate summary statistics from benchmark results

        Returns:
            Dict with stats per model:
            {
                "model_name": {
                    "avg_latency_ms": float,
                    "avg_quality_score": float,
                    "success_count": int,
                    "error_count": int,
                    "total_cost": float
                }
            }
        """
        stats = {}

        for result in self.results:
            if result.model_name not in stats:
                stats[result.model_name] = {
                    "latencies": [],
                    "quality_scores": [],
                    "success_count": 0,
                    "error_count": 0,
                    "total_cost": 0.0
                }

            model_stats = stats[result.model_name]

            if result.error:
                model_stats["error_count"] += 1
            else:
                model_stats["success_count"] += 1
                model_stats["latencies"].append(result.latency_ms)

                if result.quality_score is not None:
                    model_stats["quality_scores"].append(result.quality_score)

                if result.cost_estimate is not None:
                    model_stats["total_cost"] += result.cost_estimate

        # Calculate averages
        summary = {}
        for model_name, data in stats.items():
            summary[model_name] = {
                "avg_latency_ms": sum(data["latencies"]) / len(data["latencies"]) if data["latencies"] else 0,
                "avg_quality_score": sum(data["quality_scores"]) / len(data["quality_scores"]) if data["quality_scores"] else None,
                "success_count": data["success_count"],
                "error_count": data["error_count"],
                "total_cost": data["total_cost"]
            }

        return summary

    def export_to_dict(self) -> Dict:
        """Export results to dictionary for JSON/Excel export"""
        return {
            "results": [asdict(r) for r in self.results],
            "summary": self.get_summary_stats()
        }

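# --- Illustrative sketch (not in the packaged module): offline benchmark run ---
# Wires LLMLeaderboard to a stub client factory so the benchmark loop, summary statistics
# and export can be exercised without API keys. The stub mirrors the translate() keyword
# arguments used by _translate_segment() above; because it merely echoes the source text,
# any chrF++ scores it produces are meaningless.
def _demo_offline_benchmark():
    class _StubClient:
        def translate(self, text, source_lang, target_lang, custom_prompt=None):
            return text  # pretend "translation": echo the source

    def stub_factory(provider, model_id):
        return _StubClient()

    dataset = TestDataset("Stub run", "Single throwaway segment")
    dataset.add_segment(TestSegment(1, "Hello world.", "Hallo wereld.", "general", "EN→NL"))

    board = LLMLeaderboard(llm_client_factory=stub_factory)
    models = [ModelConfig(name="Stub A", provider="openai", model_id="stub-a")]
    board.run_benchmark(dataset, models)

    summary = board.get_summary_stats()
    assert summary["Stub A"]["success_count"] == 1
    return board.export_to_dict()
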
def create_sample_datasets() -> List[TestDataset]:
    """Create sample test datasets for quick testing"""

    # Business EN→NL dataset
    business_en_nl = TestDataset(
        name="Business EN→NL",
        description="Formal business correspondence and documents"
    )

    business_segments = [
        TestSegment(1, "We are pleased to inform you that your order has been processed.",
                    "Wij zijn verheugd u te kunnen mededelen dat uw bestelling is verwerkt.",
                    "business", "EN→NL", "formal business email"),
        TestSegment(2, "Please find attached the invoice for your recent purchase.",
                    "In de bijlage treft u de factuur aan voor uw recente aankoop.",
                    "business", "EN→NL", "business correspondence"),
        TestSegment(3, "We would like to schedule a meeting to discuss the project timeline.",
                    "Wij willen graag een vergadering plannen om de projectplanning te bespreken.",
                    "business", "EN→NL", "project management"),
        TestSegment(4, "The annual report will be published next quarter.",
                    "Het jaarverslag zal volgend kwartaal worden gepubliceerd.",
                    "business", "EN→NL", "corporate communication"),
        TestSegment(5, "Thank you for your prompt response to our inquiry.",
                    "Hartelijk dank voor uw snelle reactie op onze vraag.",
                    "business", "EN→NL", "business email"),
    ]

    for seg in business_segments:
        business_en_nl.add_segment(seg)

    # Technical EN→NL dataset
    technical_en_nl = TestDataset(
        name="Technical EN→NL",
        description="Technical documentation and user manuals"
    )

    technical_segments = [
        TestSegment(1, "Press the power button to turn on the device.",
                    "Druk op de aan/uit-knop om het apparaat in te schakelen.",
                    "technical", "EN→NL", "user manual"),
        TestSegment(2, "The software supports Windows 10 and later versions.",
                    "De software ondersteunt Windows 10 en latere versies.",
                    "technical", "EN→NL", "system requirements"),
        TestSegment(3, "Ensure that all cables are properly connected before starting.",
                    "Zorg ervoor dat alle kabels correct zijn aangesloten voordat u begint.",
                    "technical", "EN→NL", "installation guide"),
        TestSegment(4, "The battery life is approximately 8 hours under normal usage.",
                    "De batterijduur bedraagt ongeveer 8 uur bij normaal gebruik.",
                    "technical", "EN→NL", "product specifications"),
        TestSegment(5, "For technical support, please contact our service department.",
                    "Neem voor technische ondersteuning contact op met onze serviceafdeling.",
                    "technical", "EN→NL", "support information"),
    ]

    for seg in technical_segments:
        technical_en_nl.add_segment(seg)

    # Legal NL→EN dataset
    legal_nl_en = TestDataset(
        name="Legal NL→EN",
        description="Legal contracts and formal documents"
    )

    legal_segments = [
        TestSegment(1, "De partijen zijn overeengekomen als volgt.",
                    "The parties have agreed as follows.",
                    "legal", "NL→EN", "contract clause"),
        TestSegment(2, "Deze overeenkomst treedt in werking op de datum van ondertekening.",
                    "This agreement shall enter into force on the date of signature.",
                    "legal", "NL→EN", "contract terms"),
        TestSegment(3, "Beide partijen verklaren bevoegd te zijn deze overeenkomst aan te gaan.",
                    "Both parties declare to be authorized to enter into this agreement.",
                    "legal", "NL→EN", "legal declaration"),
        TestSegment(4, "In geval van geschillen zal bemiddeling worden gezocht.",
                    "In case of disputes, mediation shall be sought.",
                    "legal", "NL→EN", "dispute resolution"),
        TestSegment(5, "Deze overeenkomst is onderworpen aan Nederlands recht.",
                    "This agreement is governed by Dutch law.",
                    "legal", "NL→EN", "governing law"),
    ]

    for seg in legal_segments:
        legal_nl_en.add_segment(seg)

    return [business_en_nl, technical_en_nl, legal_nl_en]


def create_dataset_from_project(
    project,
    sample_size: int = 10,
    sampling_method: str = "smart",
    require_targets: bool = False
) -> Tuple[TestDataset, Dict]:
    """
    Create test dataset from current Supervertaler project

    Supports two scenarios:
    - Translated projects: Uses existing targets as reference for quality scoring
    - Untranslated projects: No references, compare speed/cost/outputs only

    Args:
        project: Supervertaler project object with segments
        sample_size: Number of segments to include (default 10)
        sampling_method: "random", "evenly_spaced", or "smart" (default)
        require_targets: If True, only include segments with targets

    Returns:
        Tuple of (TestDataset, metadata_dict)
        metadata_dict contains info about reference availability
    """
    # Create dataset with project info
    dataset = TestDataset(
        name=f"Project: {getattr(project, 'name', 'Current')}",
        description=f"Sample from current project ({getattr(project, 'source_lang', '??')}→{getattr(project, 'target_lang', '??')})"
    )

    # Get eligible segments
    eligible_segments = []
    translated_count = 0

    for seg in project.segments:
        has_source = seg.source and seg.source.strip()
        has_target = seg.target and seg.target.strip()

        if has_target:
            translated_count += 1

        if require_targets:
            # Only segments with existing translations
            if has_source and has_target:
                eligible_segments.append(seg)
        else:
            # All segments with source text
            if has_source:
                eligible_segments.append(seg)

    # Check if we have enough segments
    if len(eligible_segments) == 0:
        raise ValueError("No eligible segments found in project")

    # Sample segments
    actual_sample_size = min(sample_size, len(eligible_segments))
    sampled = _sample_segments(eligible_segments, actual_sample_size, sampling_method)

    # Create TestSegments
    reference_count = 0
    for i, seg in enumerate(sampled, 1):
        # Use target as reference if available
        reference = seg.target if (seg.target and seg.target.strip()) else ""
        if reference:
            reference_count += 1

        test_seg = TestSegment(
            id=seg.id,
            source=seg.source,
            reference=reference,
            domain=getattr(project, 'domain', 'general'),
            direction=f"{getattr(project, 'source_lang', 'XX')}→{getattr(project, 'target_lang', 'XX')}",
            context=""
        )
        dataset.add_segment(test_seg)

    # Build metadata
    metadata = {
        "total_segments": len(project.segments),
        "translated_count": translated_count,
        "translation_percentage": (translated_count / len(project.segments) * 100) if project.segments else 0,
        "eligible_segments": len(eligible_segments),
        "sampled_segments": len(sampled),
        "segments_with_references": reference_count,
        "has_references": reference_count > 0,
        "quality_scoring_available": reference_count > 0,
        "sampling_method": sampling_method
    }

    return dataset, metadata

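# --- Illustrative sketch (not in the packaged module): sampling from a project ---
# create_dataset_from_project() only needs an object whose .segments expose .id, .source and
# .target; everything else falls back through getattr(). SimpleNamespace makes a convenient
# stand-in. Segment 2 below has no target, so it is still eligible (require_targets=False)
# but contributes no reference for quality scoring.
def _demo_project_sampling():
    from types import SimpleNamespace

    stand_in = SimpleNamespace(
        name="Stand-in project",
        source_lang="EN",
        target_lang="NL",
        segments=[
            SimpleNamespace(id=1, source="First sentence.", target="Eerste zin."),
            SimpleNamespace(id=2, source="Second sentence.", target=""),
            SimpleNamespace(id=3, source="Third sentence.", target="Derde zin."),
        ],
    )

    dataset, meta = create_dataset_from_project(stand_in, sample_size=2, sampling_method="random")
    assert meta["eligible_segments"] == 3
    assert meta["translated_count"] == 2
    return dataset, meta
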
def _sample_segments(segments: List, n: int, method: str) -> List:
    """
    Sample segments from project using specified method

    Args:
        segments: List of segment objects
        n: Number of segments to sample
        method: "random", "evenly_spaced", or "smart"

    Returns:
        List of sampled segments
    """
    if len(segments) <= n:
        return segments

    if method == "random":
        return random.sample(segments, n)

    elif method == "evenly_spaced":
        # Stratified sampling - every Nth segment
        step = len(segments) / n
        return [segments[int(i * step)] for i in range(n)]

    elif method == "smart":
        # Smart sampling: representative coverage across document
        # 30% from beginning, 40% from middle, 30% from end
        # Ensures coverage of introduction, main content, and conclusions

        # Divide into sections
        third = len(segments) // 3
        begin = segments[:third]
        middle = segments[third:2*third]
        end = segments[2*third:]

        # Calculate samples per section
        n_begin = int(n * 0.3)
        n_middle = int(n * 0.4)
        n_end = n - n_begin - n_middle

        # Sample from each section
        sampled = []
        if begin and n_begin > 0:
            sampled.extend(random.sample(begin, min(n_begin, len(begin))))
        if middle and n_middle > 0:
            sampled.extend(random.sample(middle, min(n_middle, len(middle))))
        if end and n_end > 0:
            sampled.extend(random.sample(end, min(n_end, len(end))))

        # If we didn't get enough, fill from remaining
        if len(sampled) < n:
            remaining = [s for s in segments if s not in sampled]
            if remaining:
                needed = n - len(sampled)
                sampled.extend(random.sample(remaining, min(needed, len(remaining))))

        return sampled

    else:
        # Default to random if method not recognized
        return random.sample(segments, n)

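# --- Illustrative sketch (not in the packaged module): the "smart" 30/40/30 split ---
# For n=10 the quotas work out to 3 segments from the first third, 4 from the middle third
# and 3 from the final third, with any shortfall topped up from the remaining segments.
# The integers below are stand-ins for real segment objects.
def _demo_smart_sampling():
    toy_segments = list(range(1, 31))              # 30 pseudo-segments
    picked = _sample_segments(toy_segments, 10, "smart")
    assert len(picked) == 10
    assert sum(1 for s in picked if s <= 10) == 3  # beginning quota
    return picked
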
# For testing/development
if __name__ == "__main__":
    print("LLM Leaderboard - Core Module")
    print("=" * 50)

    # Create sample datasets
    datasets = create_sample_datasets()

    print(f"\nCreated {len(datasets)} sample datasets:")
    for ds in datasets:
        print(f" • {ds.name}: {len(ds.segments)} segments")

    # Check if chrF++ is available
    if CHRF_AVAILABLE:
        print("\n✅ chrF++ quality scoring available")
    else:
        print("\nWarning: chrF++ quality scoring not available (sacrebleu not installed)")