supervertaler 1.9.163__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. Supervertaler.py +48473 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1911 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +351 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1176 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.163.dist-info/METADATA +906 -0
  81. supervertaler-1.9.163.dist-info/RECORD +85 -0
  82. supervertaler-1.9.163.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.163.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.163.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.163.dist-info/top_level.txt +2 -0
@@ -0,0 +1,319 @@
1
+ """
2
+ Text Encoding Corruption Detection and Repair Module
3
+
4
+ Detects and fixes common text encoding issues (mojibake), particularly:
5
+ - UTF-8 text incorrectly decoded as Latin-1 (Windows-1252)
6
+ - Double-encoded Unicode escape sequences
7
+ - Common encoding corruption patterns
8
+ """
9
+
10
+ import re
11
+ from pathlib import Path
12
+ from typing import Tuple, List, Dict
13
+
14
+ try:
15
+ import chardet
16
+ CHARDET_AVAILABLE = True
17
+ except ImportError:
18
+ CHARDET_AVAILABLE = False
19
+
20
+
21
class EncodingRepair:
    """Detect and repair text encoding corruption (mojibake).

    All detection/repair works on *literal* escape-sequence text: the
    pattern key ``'\\u00e2\\u20ac\\u201c'`` matches the 18 characters
    ``\\u00e2\\u20ac\\u201c`` appearing verbatim in a file (e.g. from a
    double-encoding round trip), NOT the decoded Unicode characters.
    All methods are static; the class is used as a namespace.
    """

    # Common mojibake patterns (UTF-8 misinterpreted as Latin-1).
    # NOTE: replacement order follows dict insertion order (Python 3.7+);
    # the longer '-\\u0090' entry is deliberately listed before the bare
    # '\\u0090' entry so the hyphen variant wins.
    CORRUPTION_PATTERNS = {
        # En dash, em dash, hyphen variants (as literal strings, not Unicode escapes)
        '\\u00e2\\u20ac\\u201c': '–',  # en dash
        '\\u00e2\\u20ac\\u201d': '—',  # em dash
        '\\u00e2\\u20ac\\u0090': '-',  # non-breaking hyphen (3-char sequence)

        # Standalone Unicode escape sequences (single occurrence)
        '-\\u0090': '-',  # hyphen + corruption → single hyphen
        '\\u0090': '',    # standalone corruption → remove it

        # Quotes and apostrophes
        '\\u00e2\\u20ac\\u0153': '"',       # left double quote
        '\\u00e2\\u20ac\\u009d': '"',       # right double quote
        '\\u00e2\\u20ac\\u0098': '\u2018',  # left single quote
        '\\u00e2\\u20ac\\u0099': '\u2019',  # right single quote
        '\\u00e2\\u20ac\\u2122': '\u2019',  # apostrophe/right single quote

        # Ellipsis and other punctuation
        '\\u00e2\\u20ac\\u00a6': '…',  # ellipsis
        '\\u00e2\\u20ac\\u00a2': '•',  # bullet

        # Spaces
        '\\u00c2\\u00a0': ' ',  # non-breaking space

        # Degree and special symbols
        '\\u00c2\\u00b0': '°',  # degree
        '\\u00c3\\u00a9': 'é',  # e acute (example)
    }

    # Regex pattern to find potential corruption sequences
    UNICODE_ESCAPE_PATTERN = re.compile(r'\\u[0-9a-fA-F]{4}')

    @staticmethod
    def detect_corruption(text: str) -> Tuple[bool, int, List[str]]:
        """
        Detect if text contains encoding corruption patterns.

        Args:
            text: Text content to analyze

        Returns:
            Tuple of (has_corruption, corruption_count, list_of_patterns_found)
        """
        corruption_count = 0
        patterns_found: List[str] = []

        # Single pass: count each pattern once (the previous version
        # re-counted every found pattern a second time in a sum()).
        for pattern, corrected_char in EncodingRepair.CORRUPTION_PATTERNS.items():
            matches = text.count(pattern)
            if matches > 0:
                corruption_count += matches
                patterns_found.append(f"{pattern} → {corrected_char} ({matches} occurrences)")

        return corruption_count > 0, corruption_count, patterns_found

    @staticmethod
    def repair_text(text: str) -> str:
        """
        Repair encoding corruption in text.

        Args:
            text: Text content to repair

        Returns:
            Repaired text with every known corruption pattern replaced.
        """
        repaired = text

        # Apply replacements in declaration order (see CORRUPTION_PATTERNS note).
        for pattern, replacement in EncodingRepair.CORRUPTION_PATTERNS.items():
            repaired = repaired.replace(pattern, replacement)

        return repaired

    @staticmethod
    def repair_file(file_path: str, encoding: str = 'utf-8') -> Tuple[bool, str, Dict]:
        """
        Detect and repair encoding corruption in a file (in place).

        Args:
            file_path: Path to the file to repair
            encoding: Encoding to use when reading and writing the file

        Returns:
            Tuple of (success, message, repair_info). On failure the info
            dict contains only an 'error' key.
        """
        try:
            file_path = Path(file_path)

            # Read file
            with open(file_path, 'r', encoding=encoding) as f:
                original_text = f.read()

            # Detect corruption
            has_corruption, corruption_count, patterns = EncodingRepair.detect_corruption(original_text)

            repair_info = {
                'file': str(file_path),
                'encoding': encoding,
                'has_corruption': has_corruption,
                'corruption_count': corruption_count,
                'patterns_found': patterns,
                'original_size': len(original_text),
            }

            if not has_corruption:
                return True, "No encoding corruption detected.", repair_info

            # Repair
            repaired_text = EncodingRepair.repair_text(original_text)

            # Write back (same encoding the file was read with)
            with open(file_path, 'w', encoding=encoding) as f:
                f.write(repaired_text)

            repair_info['repaired_size'] = len(repaired_text)

            message = (
                f"✅ File repaired successfully!\n"
                f"Found and fixed {corruption_count} corruption(s) in {len(patterns)} pattern(s).\n"
                f"Patterns:\n" + "\n".join(f"  • {p}" for p in patterns)
            )

            return True, message, repair_info

        except Exception as e:
            return False, f"❌ Error: {str(e)}", {'error': str(e)}

    @staticmethod
    def repair_with_encoding_fallback(file_path: str) -> Tuple[bool, str, Dict]:
        """
        Try to repair a file by attempting different encodings.

        This handles the case where the file itself might be in the wrong
        encoding: the encoding is auto-detected (chardet when available),
        then several common encodings are attempted, and the repaired
        result is always written back as UTF-8.

        Args:
            file_path: Path to the file to repair

        Returns:
            Tuple of (success, message, repair_info)
        """
        try:
            file_path = Path(file_path)

            # Try to detect encoding
            detected_encoding = 'utf-8'

            if CHARDET_AVAILABLE:
                with open(file_path, 'rb') as f:
                    raw_data = f.read()

                detected = chardet.detect(raw_data)
                # chardet may report {'encoding': None} when it cannot
                # identify the data; fall back to utf-8 instead of
                # passing None to open() (which would silently use the
                # locale encoding and report "None" in the message).
                detected_encoding = detected.get('encoding') or 'utf-8'

            # Try reading with detected encoding first
            try:
                with open(file_path, 'r', encoding=detected_encoding) as f:
                    text = f.read()
            except (UnicodeDecodeError, LookupError):
                # Try common encodings
                for encoding in ['utf-8', 'latin-1', 'windows-1252', 'iso-8859-1']:
                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            text = f.read()
                        detected_encoding = encoding
                        break
                    except (UnicodeDecodeError, LookupError):
                        continue
                else:
                    # Fall back with error handling (last resort: drop
                    # undecodable bytes so we can still repair the rest)
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        text = f.read()

            # Now detect corruption
            has_corruption, corruption_count, patterns = EncodingRepair.detect_corruption(text)

            if not has_corruption:
                return True, "No encoding corruption detected.", {
                    'file': str(file_path),
                    'detected_encoding': detected_encoding,
                    'corruption_count': 0,
                }

            # Repair
            repaired_text = EncodingRepair.repair_text(text)

            # Write back as UTF-8
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(repaired_text)

            message = (
                f"✅ File repaired successfully!\n"
                f"Original encoding: {detected_encoding}\n"
                f"Fixed {corruption_count} corruption(s) in {len(patterns)} pattern(s).\n"
                f"Saved as UTF-8.\n"
                f"Patterns:\n" + "\n".join(f"  • {p}" for p in patterns)
            )

            return True, message, {
                'file': str(file_path),
                'detected_encoding': detected_encoding,
                'corruption_count': corruption_count,
                'patterns_found': patterns,
            }

        except Exception as e:
            return False, f"❌ Error: {str(e)}", {'error': str(e)}

    @staticmethod
    def scan_directory(directory_path: str, file_extensions: List[str] = None) -> Dict:
        """
        Scan a directory (recursively) for files with encoding corruption.

        Args:
            directory_path: Path to directory to scan
            file_extensions: List of file extensions to check (e.g. ['.txt', '.csv']).
                If None, defaults to ['.txt', '.csv', '.tsv', '.md'].
                Pass an empty list to scan every file regardless of extension.

        Returns:
            Dictionary with scan results: files_scanned, files_with_corruption
            (list of per-file dicts), total_corruptions, and optionally 'error'.
        """
        if file_extensions is None:
            file_extensions = ['.txt', '.csv', '.tsv', '.md']

        dir_path = Path(directory_path)
        results = {
            'directory': str(dir_path),
            'files_scanned': 0,
            'files_with_corruption': [],
            'total_corruptions': 0,
        }

        try:
            for file_path in dir_path.rglob('*'):
                # Skip directories
                if file_path.is_dir():
                    continue

                # Check extension if specified (extension match is
                # case-insensitive)
                if file_extensions and file_path.suffix.lower() not in file_extensions:
                    continue

                results['files_scanned'] += 1

                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()

                    has_corruption, count, patterns = EncodingRepair.detect_corruption(content)

                    if has_corruption:
                        results['files_with_corruption'].append({
                            'file': str(file_path),
                            'corruptions': count,
                            'patterns': patterns,
                        })
                        results['total_corruptions'] += count

                except Exception:
                    # Deliberately best-effort: skip files that can't be read
                    pass

        except Exception as e:
            results['error'] = str(e)

        return results
292
+
293
+
294
# Example usage / testing
if __name__ == '__main__':
    # Demo text containing *literal* mojibake escape sequences — the
    # backslashes are doubled so the string really contains the text
    # "\u00e2\u20ac\u201c" etc., matching the literal-string keys in
    # EncodingRepair.CORRUPTION_PATTERNS. (With single backslashes the
    # compiler decodes them to characters and nothing is ever detected.)
    test_text = (
        'young concrete\\u00e2\\u20ac\\u201cjong beton\n'
        'aggregate \\u00e2\\u20ac\\u201c chemical impurities\\u00e2\\u20ac\\u201ctoeslagmateriaal\n'
        'Poisson\\u00e2\\u20ac\\u2122s ratio\\u00e2\\u20ac\\u201ccoëfficiënt van Poisson'
    )

    print("Original text:")
    print(test_text)
    print("\n" + "="*60 + "\n")

    # Detect corruption
    has_corruption, count, patterns = EncodingRepair.detect_corruption(test_text)
    print(f"Corruption detected: {has_corruption}")
    print(f"Total corruptions: {count}")
    print("Patterns found:")
    for p in patterns:
        print(f"  • {p}")
    print("\n" + "="*60 + "\n")

    # Repair
    repaired = EncodingRepair.repair_text(test_text)
    print("Repaired text:")
    print(repaired)