supervertaler-1.9.163-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. Supervertaler.py +48473 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1911 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +351 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1176 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.163.dist-info/METADATA +906 -0
  81. supervertaler-1.9.163.dist-info/RECORD +85 -0
  82. supervertaler-1.9.163.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.163.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.163.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.163.dist-info/top_level.txt +2 -0
modules/llm_leaderboard.py (new file, 737 lines added):
@@ -0,0 +1,737 @@
"""
LLM Leaderboard - Core Benchmarking Module
===========================================

Comprehensive LLM translation benchmarking system for Supervertaler.
Compare translation quality, speed, and cost across multiple providers.

Features:
- Multi-provider comparison (OpenAI, Claude, Gemini)
- Quality scoring (chrF++ metric)
- Speed measurement (latency per segment)
- Cost estimation (token-based)
- Test dataset management
- Results export (Excel/CSV)

Author: Michael Beijer
License: MIT
"""

import time
import json
import random
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict
import threading

try:
    from sacrebleu.metrics import CHRF
    CHRF_AVAILABLE = True
except ImportError:
    CHRF_AVAILABLE = False
    print("Warning: sacrebleu not installed. Quality scoring will be disabled.")
    print("Install with: pip install sacrebleu")

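The chrF++ metric from sacrebleu scores a hypothesis against a reference on a 0-100 scale using character and word n-grams. A minimal standalone sketch of the metric used below (assumes sacrebleu is installed; not part of this module):

# Illustrative only: standalone chrF++ scoring with sacrebleu.
from sacrebleu.metrics import CHRF

chrf = CHRF(word_order=2)  # word_order=2 turns chrF into chrF++ (adds word bigrams)
hypothesis = "De kat zit op de mat."
reference = "De kat zit op de mat."
# corpus_score takes a list of hypotheses and a list of reference lists;
# an identical hypothesis and reference yields 100.0.
score = chrf.corpus_score([hypothesis], [[reference]])
print(f"chrF++: {score.score:.1f}")
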
@dataclass
class TestSegment:
    """Single test segment with source and reference translation"""
    id: int
    source: str
    reference: str
    domain: str = "general"
    direction: str = "EN→NL"
    context: str = ""


@dataclass
class BenchmarkResult:
    """Result of translating a single segment with one model"""
    segment_id: int
    model_name: str
    provider: str
    model_id: str
    output: str
    latency_ms: float
    quality_score: Optional[float] = None
    error: Optional[str] = None
    tokens_input: Optional[int] = None
    tokens_output: Optional[int] = None
    cost_estimate: Optional[float] = None


@dataclass
class ModelConfig:
    """Configuration for a single model to test"""
    name: str  # Display name (e.g., "GPT-4o")
    provider: str  # "openai", "claude", "gemini"
    model_id: str  # Actual model ID for API
    enabled: bool = True


class TestDataset:
    """Manages test datasets for benchmarking"""

    def __init__(self, name: str, description: str = ""):
        self.name = name
        self.description = description
        self.segments: List[TestSegment] = []

    def add_segment(self, segment: TestSegment):
        """Add a test segment to the dataset"""
        self.segments.append(segment)

    def to_dict(self) -> Dict:
        """Convert dataset to dictionary for JSON export"""
        return {
            "name": self.name,
            "description": self.description,
            "segments": [asdict(seg) for seg in self.segments]
        }

    @classmethod
    def from_dict(cls, data: Dict) -> 'TestDataset':
        """Load dataset from dictionary"""
        dataset = cls(data["name"], data.get("description", ""))
        for seg_data in data.get("segments", []):
            dataset.add_segment(TestSegment(**seg_data))
        return dataset

    @classmethod
    def from_json_file(cls, filepath: Path) -> 'TestDataset':
        """Load dataset from JSON file"""
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls.from_dict(data)

    def save_to_json(self, filepath: Path):
        """Save dataset to JSON file"""
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)

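A dataset saved with save_to_json() round-trips through from_json_file(). A minimal sketch of the serialized shape (the file name is arbitrary and the JSON layout is abridged):

# Illustrative round-trip; not part of this module.
from pathlib import Path

ds = TestDataset("Smoke test", "One-segment example")
ds.add_segment(TestSegment(1, "Hello world", "Hallo wereld", "general", "EN→NL"))
ds.save_to_json(Path("smoke_test.json"))
# smoke_test.json then holds (formatting abridged):
# {
#   "name": "Smoke test",
#   "description": "One-segment example",
#   "segments": [
#     {"id": 1, "source": "Hello world", "reference": "Hallo wereld",
#      "domain": "general", "direction": "EN→NL", "context": ""}
#   ]
# }
restored = TestDataset.from_json_file(Path("smoke_test.json"))
assert restored.to_dict() == ds.to_dict()
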
class LLMLeaderboard:
    """Main benchmarking engine for LLM translation comparison"""

    def __init__(self, llm_client_factory, log_callback=None):
        """
        Initialize LLM Leaderboard

        Args:
            llm_client_factory: Function that creates LLMClient instances
                Signature: (provider: str, model: str) -> LLMClient
            log_callback: Optional callback for logging messages
        """
        self.llm_client_factory = llm_client_factory
        self.log = log_callback if log_callback else print
        self.chrf_metric = CHRF(word_order=2) if CHRF_AVAILABLE else None
        self.results: List[BenchmarkResult] = []
        self.is_running = False
        self.cancel_requested = False

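The constructor only needs a callable mapping (provider, model_id) to an object exposing translate(); in the application the real clients come from modules/llm_clients.py. A minimal stub factory for a dry run might look like this (EchoClient is a placeholder, not the actual llm_clients API):

# Hypothetical factory sketch for a dry run; not part of this module.
class EchoClient:
    """Stand-in client: 'translates' by returning the source text unchanged."""
    def translate(self, text, source_lang, target_lang, custom_prompt=None):
        return text

def make_stub_client(provider: str, model: str) -> EchoClient:
    # A real factory would select an OpenAI, Claude or Gemini client here.
    return EchoClient()

leaderboard = LLMLeaderboard(make_stub_client, log_callback=print)
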
    def _lang_code_to_name(self, code: str) -> str:
        """Convert language code to full language name for LLM prompts"""
        # Common language codes to names mapping
        lang_map = {
            "en": "English", "en-us": "English", "en-gb": "English",
            "nl": "Dutch", "nl-nl": "Dutch", "nl-be": "Dutch (Belgian)",
            "de": "German", "de-de": "German", "de-at": "German (Austrian)",
            "fr": "French", "fr-fr": "French", "fr-be": "French (Belgian)",
            "es": "Spanish", "es-es": "Spanish", "es-mx": "Spanish (Mexican)",
            "it": "Italian", "it-it": "Italian",
            "pt": "Portuguese", "pt-pt": "Portuguese", "pt-br": "Portuguese (Brazilian)",
            "ru": "Russian", "ru-ru": "Russian",
            "zh": "Chinese", "zh-cn": "Chinese (Simplified)", "zh-tw": "Chinese (Traditional)",
            "ja": "Japanese", "ja-jp": "Japanese",
            "ko": "Korean", "ko-kr": "Korean",
            "ar": "Arabic", "ar-sa": "Arabic",
            "pl": "Polish", "pl-pl": "Polish",
            "sv": "Swedish", "sv-se": "Swedish",
            "da": "Danish", "da-dk": "Danish",
            "no": "Norwegian", "nb-no": "Norwegian",
            "fi": "Finnish", "fi-fi": "Finnish",
            "cs": "Czech", "cs-cz": "Czech",
            "tr": "Turkish", "tr-tr": "Turkish",
            "el": "Greek", "el-gr": "Greek",
            "he": "Hebrew", "he-il": "Hebrew",
            "hi": "Hindi", "hi-in": "Hindi",
            "th": "Thai", "th-th": "Thai",
            "vi": "Vietnamese", "vi-vn": "Vietnamese",
            "id": "Indonesian", "id-id": "Indonesian",
            "ms": "Malay", "ms-my": "Malay",
            "uk": "Ukrainian", "uk-ua": "Ukrainian",
            "ro": "Romanian", "ro-ro": "Romanian",
            "hu": "Hungarian", "hu-hu": "Hungarian",
            "bg": "Bulgarian", "bg-bg": "Bulgarian",
            "hr": "Croatian", "hr-hr": "Croatian",
            "sr": "Serbian", "sr-rs": "Serbian",
            "sk": "Slovak", "sk-sk": "Slovak",
            "sl": "Slovenian", "sl-si": "Slovenian",
            "lt": "Lithuanian", "lt-lt": "Lithuanian",
            "lv": "Latvian", "lv-lv": "Latvian",
            "et": "Estonian", "et-ee": "Estonian",
        }

        # Normalize code to lowercase
        code_lower = code.lower().strip()

        # Return mapped name or capitalize the code as fallback
        return lang_map.get(code_lower, code.upper())

    def build_translation_prompt(self, segment: TestSegment) -> str:
        """Build translation prompt for a test segment"""
        # Extract language codes from direction (e.g., "EN→NL" or "en→nl")
        parts = segment.direction.split("→")
        source_code = parts[0].strip() if len(parts) > 0 else "source language"
        target_code = parts[1].strip() if len(parts) > 1 else "target language"

        # Convert codes to full language names
        source_lang = self._lang_code_to_name(source_code)
        target_lang = self._lang_code_to_name(target_code)
        direction_hint = f"from {source_lang} to {target_lang}"

        prompt = f"""You are a professional translator. Translate the following text {direction_hint}.

Domain: {segment.domain}
Target language: {target_lang}
Requirements: Be faithful to meaning, natural, and correct. Preserve units, numbers, and formatting. Keep terminology consistent.

Text to translate:
{segment.source}

Return ONLY the translation, no explanations or additional text."""

        if segment.context:
            prompt = f"Context: {segment.context}\n\n{prompt}"

        return prompt

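A quick way to see the prompt the engine actually sends is to build one for a segment with a context value, as in this sketch (the segment mirrors the sample business data defined further down in this module):

# Illustrative only; prints the generated prompt for one sample segment.
segment = TestSegment(1, "We are pleased to inform you that your order has been processed.",
                      "", "business", "EN→NL", "formal business email")
print(LLMLeaderboard(lambda provider, model: None).build_translation_prompt(segment))
# Prints (abridged):
#   Context: formal business email
#
#   You are a professional translator. Translate the following text from English to Dutch.
#
#   Domain: business
#   Target language: Dutch
#   ...
#   Return ONLY the translation, no explanations or additional text.
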
    def run_benchmark(
        self,
        dataset: TestDataset,
        models: List[ModelConfig],
        progress_callback=None
    ) -> List[BenchmarkResult]:
        """
        Run benchmark comparing multiple models on a test dataset

        Args:
            dataset: TestDataset to run
            models: List of ModelConfig to test
            progress_callback: Optional callback(current, total, message)

        Returns:
            List of BenchmarkResult objects
        """
        self.is_running = True
        self.cancel_requested = False
        self.results = []

        enabled_models = [m for m in models if m.enabled]
        total_tests = len(dataset.segments) * len(enabled_models)
        current_test = 0

        self.log(f"Starting benchmark: {dataset.name}")
        self.log(f" Models: {', '.join(m.name for m in enabled_models)}")
        self.log(f" Segments: {len(dataset.segments)}")
        self.log(f" Total translations: {total_tests}")

        for segment in dataset.segments:
            if self.cancel_requested:
                self.log("Warning: Benchmark cancelled by user")
                break

            prompt = self.build_translation_prompt(segment)

            for model_config in enabled_models:
                if self.cancel_requested:
                    break

                current_test += 1

                # Progress update
                if progress_callback:
                    progress_callback(
                        current_test,
                        total_tests,
                        f"Testing {model_config.name} on segment {segment.id}"
                    )

                # Run translation and measure time
                result = self._translate_segment(
                    segment,
                    model_config,
                    prompt
                )

                self.results.append(result)

                # Log result
                if result.error:
                    self.log(f" ERROR {model_config.name} seg {segment.id}: {result.error}")
                else:
                    quality_str = f", chrF++: {result.quality_score:.1f}" if result.quality_score else ""
                    self.log(f" OK {model_config.name} seg {segment.id}: {result.latency_ms:.0f}ms{quality_str}")

        self.is_running = False
        self.log(f"Benchmark complete: {len(self.results)} results")

        return self.results

    def _translate_segment(
        self,
        segment: TestSegment,
        model_config: ModelConfig,
        prompt: str
    ) -> BenchmarkResult:
        """Translate a single segment with one model and measure performance"""

        result = BenchmarkResult(
            segment_id=segment.id,
            model_name=model_config.name,
            provider=model_config.provider,
            model_id=model_config.model_id,
            output="",
            latency_ms=0.0
        )

        try:
            self.log(f" DEBUG: Starting translation for segment {segment.id} with {model_config.name}")

            # Validate segment data first
            if not segment:
                result.error = "Null segment"
                self.log(f" ERROR: Null segment received")
                return result

            if not hasattr(segment, 'id') or segment.id is None:
                result.error = "Segment missing ID"
                self.log(f" ERROR: Segment missing ID")
                return result

            if not hasattr(segment, 'source'):
                result.error = f"Segment {segment.id} missing source attribute"
                self.log(f" ERROR: {result.error}")
                return result

            # Validate segment source text
            if not segment.source or not segment.source.strip():
                result.error = "Empty source text"
                self.log(f" ERROR: Segment {segment.id} has empty source text")
                return result

            # Parse language codes from direction
            if not hasattr(segment, 'direction') or not segment.direction:
                result.error = f"Segment {segment.id} missing direction"
                self.log(f" ERROR: {result.error}")
                return result

            try:
                direction_parts = segment.direction.split("→")
                if len(direction_parts) != 2:
                    raise ValueError(f"Invalid direction format: {segment.direction}")
                source_lang = direction_parts[0].strip().lower()
                target_lang = direction_parts[1].strip().lower()

                if not source_lang or not target_lang:
                    raise ValueError(f"Empty language codes in direction: {segment.direction}")
            except Exception as lang_err:
                result.error = f"Invalid language direction: {segment.direction} - {str(lang_err)}"
                self.log(f" ERROR: {result.error}")
                return result

            # Create LLM client with error handling
            try:
                self.log(f" DEBUG: Creating client for {model_config.provider} with model {model_config.model_id}")
                client = self.llm_client_factory(model_config.provider, model_config.model_id)
                if not client:
                    result.error = f"Failed to create {model_config.provider} client"
                    self.log(f" ERROR: {result.error}")
                    return result
                self.log(f" DEBUG: Client created successfully")
            except Exception as client_err:
                result.error = f"Client creation failed: {str(client_err)}"
                self.log(f" ERROR: {result.error}")
                return result

            # Measure translation time with comprehensive error handling
            try:
                start_time = time.perf_counter()
                source_preview = segment.source[:50] + "..." if len(segment.source) > 50 else segment.source
                self.log(f" DEBUG: Calling translate with text='{source_preview}', source={source_lang}, target={target_lang}")

                output = client.translate(
                    text=segment.source,
                    source_lang=source_lang,
                    target_lang=target_lang,
                    custom_prompt=prompt
                )
                elapsed_time = time.perf_counter() - start_time

                # Validate output
                if output is None:
                    result.error = "Translation returned None"
                    self.log(f" ERROR: {result.error}")
                    return result

                output_preview = output[:50] if output else 'EMPTY'
                self.log(f" DEBUG: Translation received: '{output_preview}...'")

                result.output = output if isinstance(output, str) else str(output)
                result.latency_ms = elapsed_time * 1000

            except Exception as translate_err:
                result.error = f"Translation failed: {str(translate_err)}"
                self.log(f" ERROR: {result.error}")
                import traceback
                self.log(f" TRACEBACK: {traceback.format_exc()}")
                return result

            # Calculate quality score if reference is available
            if self.chrf_metric and segment.reference and segment.reference.strip():
                try:
                    score = self.chrf_metric.corpus_score([output], [[segment.reference]])
                    result.quality_score = score.score
                    self.log(f" DEBUG: chrF++ score calculated: {result.quality_score:.1f}")
                except Exception as score_err:
                    self.log(f" WARNING: chrF++ scoring failed for segment {segment.id}: {score_err}")
                    result.quality_score = None

            # TODO: Token counting and cost estimation
            # Would need to access response metadata from LLM client

        except Exception as e:
            import traceback
            result.error = str(e)
            error_details = traceback.format_exc()
            self.log(f" ERROR: Exception translating segment {segment.id} with {model_config.name}")
            self.log(f" ERROR: {str(e)}")
            self.log(f" ERROR: Traceback:\n{error_details}")

        return result

    def cancel_benchmark(self):
        """Request cancellation of running benchmark"""
        self.cancel_requested = True

    def get_summary_stats(self) -> Dict:
        """
        Calculate summary statistics from benchmark results

        Returns:
            Dict with stats per model:
            {
                "model_name": {
                    "avg_latency_ms": float,
                    "avg_quality_score": float,
                    "success_count": int,
                    "error_count": int,
                    "total_cost": float
                }
            }
        """
        stats = {}

        for result in self.results:
            if result.model_name not in stats:
                stats[result.model_name] = {
                    "latencies": [],
                    "quality_scores": [],
                    "success_count": 0,
                    "error_count": 0,
                    "total_cost": 0.0
                }

            model_stats = stats[result.model_name]

            if result.error:
                model_stats["error_count"] += 1
            else:
                model_stats["success_count"] += 1
                model_stats["latencies"].append(result.latency_ms)

                if result.quality_score is not None:
                    model_stats["quality_scores"].append(result.quality_score)

                if result.cost_estimate is not None:
                    model_stats["total_cost"] += result.cost_estimate

        # Calculate averages
        summary = {}
        for model_name, data in stats.items():
            summary[model_name] = {
                "avg_latency_ms": sum(data["latencies"]) / len(data["latencies"]) if data["latencies"] else 0,
                "avg_quality_score": sum(data["quality_scores"]) / len(data["quality_scores"]) if data["quality_scores"] else None,
                "success_count": data["success_count"],
                "error_count": data["error_count"],
                "total_cost": data["total_cost"]
            }

        return summary

    def export_to_dict(self) -> Dict:
        """Export results to dictionary for JSON/Excel export"""
        return {
            "results": [asdict(r) for r in self.results],
            "summary": self.get_summary_stats()
        }

def create_sample_datasets() -> List[TestDataset]:
    """Create sample test datasets for quick testing"""

    # Business EN→NL dataset
    business_en_nl = TestDataset(
        name="Business EN→NL",
        description="Formal business correspondence and documents"
    )

    business_segments = [
        TestSegment(1, "We are pleased to inform you that your order has been processed.",
                    "Wij zijn verheugd u te kunnen mededelen dat uw bestelling is verwerkt.",
                    "business", "EN→NL", "formal business email"),
        TestSegment(2, "Please find attached the invoice for your recent purchase.",
                    "In de bijlage treft u de factuur aan voor uw recente aankoop.",
                    "business", "EN→NL", "business correspondence"),
        TestSegment(3, "We would like to schedule a meeting to discuss the project timeline.",
                    "Wij willen graag een vergadering plannen om de projectplanning te bespreken.",
                    "business", "EN→NL", "project management"),
        TestSegment(4, "The annual report will be published next quarter.",
                    "Het jaarverslag zal volgend kwartaal worden gepubliceerd.",
                    "business", "EN→NL", "corporate communication"),
        TestSegment(5, "Thank you for your prompt response to our inquiry.",
                    "Hartelijk dank voor uw snelle reactie op onze vraag.",
                    "business", "EN→NL", "business email"),
    ]

    for seg in business_segments:
        business_en_nl.add_segment(seg)

    # Technical EN→NL dataset
    technical_en_nl = TestDataset(
        name="Technical EN→NL",
        description="Technical documentation and user manuals"
    )

    technical_segments = [
        TestSegment(1, "Press the power button to turn on the device.",
                    "Druk op de aan/uit-knop om het apparaat in te schakelen.",
                    "technical", "EN→NL", "user manual"),
        TestSegment(2, "The software supports Windows 10 and later versions.",
                    "De software ondersteunt Windows 10 en latere versies.",
                    "technical", "EN→NL", "system requirements"),
        TestSegment(3, "Ensure that all cables are properly connected before starting.",
                    "Zorg ervoor dat alle kabels correct zijn aangesloten voordat u begint.",
                    "technical", "EN→NL", "installation guide"),
        TestSegment(4, "The battery life is approximately 8 hours under normal usage.",
                    "De batterijduur bedraagt ongeveer 8 uur bij normaal gebruik.",
                    "technical", "EN→NL", "product specifications"),
        TestSegment(5, "For technical support, please contact our service department.",
                    "Neem voor technische ondersteuning contact op met onze serviceafdeling.",
                    "technical", "EN→NL", "support information"),
    ]

    for seg in technical_segments:
        technical_en_nl.add_segment(seg)

    # Legal NL→EN dataset
    legal_nl_en = TestDataset(
        name="Legal NL→EN",
        description="Legal contracts and formal documents"
    )

    legal_segments = [
        TestSegment(1, "De partijen zijn overeengekomen als volgt.",
                    "The parties have agreed as follows.",
                    "legal", "NL→EN", "contract clause"),
        TestSegment(2, "Deze overeenkomst treedt in werking op de datum van ondertekening.",
                    "This agreement shall enter into force on the date of signature.",
                    "legal", "NL→EN", "contract terms"),
        TestSegment(3, "Beide partijen verklaren bevoegd te zijn deze overeenkomst aan te gaan.",
                    "Both parties declare to be authorized to enter into this agreement.",
                    "legal", "NL→EN", "legal declaration"),
        TestSegment(4, "In geval van geschillen zal bemiddeling worden gezocht.",
                    "In case of disputes, mediation shall be sought.",
                    "legal", "NL→EN", "dispute resolution"),
        TestSegment(5, "Deze overeenkomst is onderworpen aan Nederlands recht.",
                    "This agreement is governed by Dutch law.",
                    "legal", "NL→EN", "governing law"),
    ]

    for seg in legal_segments:
        legal_nl_en.add_segment(seg)

    return [business_en_nl, technical_en_nl, legal_nl_en]

def create_dataset_from_project(
    project,
    sample_size: int = 10,
    sampling_method: str = "smart",
    require_targets: bool = False
) -> Tuple[TestDataset, Dict]:
    """
    Create test dataset from current Supervertaler project

    Supports two scenarios:
    - Translated projects: Uses existing targets as reference for quality scoring
    - Untranslated projects: No references, compare speed/cost/outputs only

    Args:
        project: Supervertaler project object with segments
        sample_size: Number of segments to include (default 10)
        sampling_method: "random", "evenly_spaced", or "smart" (default)
        require_targets: If True, only include segments with targets

    Returns:
        Tuple of (TestDataset, metadata_dict)
        metadata_dict contains info about reference availability
    """
    # Create dataset with project info
    dataset = TestDataset(
        name=f"Project: {getattr(project, 'name', 'Current')}",
        description=f"Sample from current project ({getattr(project, 'source_lang', '??')}→{getattr(project, 'target_lang', '??')})"
    )

    # Get eligible segments
    eligible_segments = []
    translated_count = 0

    for seg in project.segments:
        has_source = seg.source and seg.source.strip()
        has_target = seg.target and seg.target.strip()

        if has_target:
            translated_count += 1

        if require_targets:
            # Only segments with existing translations
            if has_source and has_target:
                eligible_segments.append(seg)
        else:
            # All segments with source text
            if has_source:
                eligible_segments.append(seg)

    # Check if we have enough segments
    if len(eligible_segments) == 0:
        raise ValueError("No eligible segments found in project")

    # Sample segments
    actual_sample_size = min(sample_size, len(eligible_segments))
    sampled = _sample_segments(eligible_segments, actual_sample_size, sampling_method)

    # Create TestSegments
    reference_count = 0
    for i, seg in enumerate(sampled, 1):
        # Use target as reference if available
        reference = seg.target if (seg.target and seg.target.strip()) else ""
        if reference:
            reference_count += 1

        test_seg = TestSegment(
            id=seg.id,
            source=seg.source,
            reference=reference,
            domain=getattr(project, 'domain', 'general'),
            direction=f"{getattr(project, 'source_lang', 'XX')}→{getattr(project, 'target_lang', 'XX')}",
            context=""
        )
        dataset.add_segment(test_seg)

    # Build metadata
    metadata = {
        "total_segments": len(project.segments),
        "translated_count": translated_count,
        "translation_percentage": (translated_count / len(project.segments) * 100) if project.segments else 0,
        "eligible_segments": len(eligible_segments),
        "sampled_segments": len(sampled),
        "segments_with_references": reference_count,
        "has_references": reference_count > 0,
        "quality_scoring_available": reference_count > 0,
        "sampling_method": sampling_method
    }

    return dataset, metadata

def _sample_segments(segments: List, n: int, method: str) -> List:
    """
    Sample segments from project using specified method

    Args:
        segments: List of segment objects
        n: Number of segments to sample
        method: "random", "evenly_spaced", or "smart"

    Returns:
        List of sampled segments
    """
    if len(segments) <= n:
        return segments

    if method == "random":
        return random.sample(segments, n)

    elif method == "evenly_spaced":
        # Systematic sampling - every Nth segment
        step = len(segments) / n
        return [segments[int(i * step)] for i in range(n)]

    elif method == "smart":
        # Smart sampling: representative coverage across document
        # 30% from beginning, 40% from middle, 30% from end
        # Ensures coverage of introduction, main content, and conclusions

        # Divide into sections
        third = len(segments) // 3
        begin = segments[:third]
        middle = segments[third:2*third]
        end = segments[2*third:]

        # Calculate samples per section
        n_begin = int(n * 0.3)
        n_middle = int(n * 0.4)
        n_end = n - n_begin - n_middle

        # Sample from each section
        sampled = []
        if begin and n_begin > 0:
            sampled.extend(random.sample(begin, min(n_begin, len(begin))))
        if middle and n_middle > 0:
            sampled.extend(random.sample(middle, min(n_middle, len(middle))))
        if end and n_end > 0:
            sampled.extend(random.sample(end, min(n_end, len(end))))

        # If we didn't get enough, fill from remaining
        if len(sampled) < n:
            remaining = [s for s in segments if s not in sampled]
            if remaining:
                needed = n - len(sampled)
                sampled.extend(random.sample(remaining, min(needed, len(remaining))))

        return sampled

    else:
        # Default to random if method not recognized
        return random.sample(segments, n)

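Concretely, with the default sample_size of 10 the "smart" method requests int(10 * 0.3) = 3 segments from the first third, int(10 * 0.4) = 4 from the middle third, and the remaining 3 from the final third. A small sketch verifying that split (illustrative only):

# Quick check of the smart-sampling split for n = 10.
docs = list(range(300))  # stand-in for 300 project segments
picked = _sample_segments(docs, 10, "smart")
assert len(picked) == 10
assert sum(1 for s in picked if s < 100) == 3        # first third
assert sum(1 for s in picked if 100 <= s < 200) == 4  # middle third
assert sum(1 for s in picked if s >= 200) == 3        # final third
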
# For testing/development
if __name__ == "__main__":
    print("LLM Leaderboard - Core Module")
    print("=" * 50)

    # Create sample datasets
    datasets = create_sample_datasets()

    print(f"\nCreated {len(datasets)} sample datasets:")
    for ds in datasets:
        print(f" • {ds.name}: {len(ds.segments)} segments")

    # Check if chrF++ is available
    if CHRF_AVAILABLE:
        print("\n✅ chrF++ quality scoring available")
    else:
        print("\nWarning: chrF++ quality scoring not available (sacrebleu not installed)")
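To exercise the whole pipeline without spending API credits, a dry run can wire the engine to a fake client. Everything below uses only the classes defined in this module; FakeClient is a stand-in for the real clients in modules/llm_clients.py (a minimal sketch, not a definitive usage pattern):

# End-to-end dry run: two fake "models" benchmarked on a sample dataset.
class FakeClient:
    def translate(self, text, source_lang, target_lang, custom_prompt=None):
        return text  # pretend the source text is already the translation

models = [
    ModelConfig(name="Fake A", provider="openai", model_id="fake-a"),
    ModelConfig(name="Fake B", provider="claude", model_id="fake-b"),
]
dataset = create_sample_datasets()[0]  # "Business EN→NL", 5 segments
lb = LLMLeaderboard(lambda provider, model: FakeClient(), log_callback=print)
results = lb.run_benchmark(dataset, models)  # 5 segments x 2 models = 10 results
summary = lb.get_summary_stats()
for name, stats in summary.items():
    print(name, f"{stats['avg_latency_ms']:.1f} ms avg,", f"{stats['error_count']} errors")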