supervertaler 1.9.163__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- Supervertaler.py +48473 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1911 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +351 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1176 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.163.dist-info/METADATA +906 -0
- supervertaler-1.9.163.dist-info/RECORD +85 -0
- supervertaler-1.9.163.dist-info/WHEEL +5 -0
- supervertaler-1.9.163.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.163.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.163.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Text Encoding Corruption Detection and Repair Module
|
|
3
|
+
|
|
4
|
+
Detects and fixes common text encoding issues (mojibake), particularly:
|
|
5
|
+
- UTF-8 text incorrectly decoded as Latin-1 (Windows-1252)
|
|
6
|
+
- Double-encoded Unicode escape sequences
|
|
7
|
+
- Common encoding corruption patterns
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Tuple, List, Dict
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
import chardet
|
|
16
|
+
CHARDET_AVAILABLE = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
CHARDET_AVAILABLE = False
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class EncodingRepair:
    """Detect and repair text encoding corruption (mojibake).

    Handles UTF-8 text that was misdecoded as Latin-1/Windows-1252, both when
    the damage appears as *literal* ``\\uXXXX`` escape text inside a file and
    when it appears as the actual misdecoded characters (e.g. ``â€“`` for an
    en dash).
    """

    # Common mojibake patterns (UTF-8 misinterpreted as Latin-1), written as
    # literal backslash-escape strings — each key is text such as
    # r'\u00e2\u20ac\u201c' (18 characters), not the characters it denotes.
    CORRUPTION_PATTERNS = {
        # En dash, em dash, hyphen variants (as literal strings, not Unicode escapes)
        '\\u00e2\\u20ac\\u201c': '–',   # en dash
        '\\u00e2\\u20ac\\u201d': '—',   # em dash
        '\\u00e2\\u20ac\\u0090': '-',   # non-breaking hyphen (3-char sequence)

        # Standalone Unicode escape sequences (single occurrence)
        '-\\u0090': '-',                # hyphen + corruption → single hyphen
        '\\u0090': '',                  # standalone corruption → remove it

        # Quotes and apostrophes
        '\\u00e2\\u20ac\\u0153': '"',        # left double quote
        '\\u00e2\\u20ac\\u009d': '"',        # right double quote
        '\\u00e2\\u20ac\\u0098': '\u2018',   # left single quote
        '\\u00e2\\u20ac\\u0099': '\u2019',   # right single quote
        '\\u00e2\\u20ac\\u2122': '\u2019',   # apostrophe/right single quote

        # Ellipsis and other punctuation
        '\\u00e2\\u20ac\\u00a6': '…',   # ellipsis
        '\\u00e2\\u20ac\\u00a2': '•',   # bullet

        # Spaces
        '\\u00c2\\u00a0': ' ',          # non-breaking space

        # Degree and special symbols
        '\\u00c2\\u00b0': '°',          # degree
        '\\u00c3\\u00a9': 'é',          # e acute (example)
    }

    # BUGFIX: the literal-escape keys above never match files that contain the
    # *actual* misdecoded characters (e.g. the demo text in __main__ reported
    # "no corruption").  Build a combined table that also maps each pattern's
    # decoded character form to the same replacement.  Keys are ASCII, so the
    # latin-1 round-trip through the 'unicode_escape' codec is lossless.
    _ALL_PATTERNS = dict(CORRUPTION_PATTERNS)
    for _literal, _fixed in CORRUPTION_PATTERNS.items():
        _decoded = _literal.encode('latin-1').decode('unicode_escape')
        if _decoded != _literal:
            _ALL_PATTERNS.setdefault(_decoded, _fixed)
    del _literal, _fixed, _decoded

    # Regex pattern to find potential corruption sequences
    UNICODE_ESCAPE_PATTERN = re.compile(r'\\u[0-9a-fA-F]{4}')

    @staticmethod
    def detect_corruption(text: str) -> Tuple[bool, int, List[str]]:
        """
        Detect if text contains encoding corruption patterns.

        Both literal ``\\uXXXX`` escape sequences and their decoded character
        forms are checked.

        Args:
            text: Text content to analyze

        Returns:
            Tuple of (has_corruption, corruption_count, list_of_patterns_found)
        """
        corruptions_found = []
        patterns_found = []

        for pattern, corrected_char in EncodingRepair._ALL_PATTERNS.items():
            matches = text.count(pattern)
            if matches > 0:
                corruptions_found.append(pattern)
                patterns_found.append(f"{pattern} → {corrected_char} ({matches} occurrences)")

        has_corruption = len(corruptions_found) > 0
        corruption_count = sum(text.count(p) for p in corruptions_found)

        return has_corruption, corruption_count, patterns_found

    @staticmethod
    def repair_text(text: str) -> str:
        """
        Repair encoding corruption in text.

        Replacements are applied longest-pattern-first so multi-character
        sequences win over their own substrings (e.g. the literal '-\\u0090'
        is consumed before the bare '\\u0090').

        Args:
            text: Text content to repair

        Returns:
            Repaired text
        """
        repaired = text

        for pattern in sorted(EncodingRepair._ALL_PATTERNS, key=len, reverse=True):
            repaired = repaired.replace(pattern, EncodingRepair._ALL_PATTERNS[pattern])

        return repaired

    @staticmethod
    def repair_file(file_path: str, encoding: str = 'utf-8') -> Tuple[bool, str, Dict]:
        """
        Detect and repair encoding corruption in a file (rewritten in place).

        Args:
            file_path: Path to the file to repair
            encoding: Encoding to use when reading and writing the file

        Returns:
            Tuple of (success, message, repair_info)
        """
        try:
            file_path = Path(file_path)

            # Read file
            with open(file_path, 'r', encoding=encoding) as f:
                original_text = f.read()

            # Detect corruption
            has_corruption, corruption_count, patterns = EncodingRepair.detect_corruption(original_text)

            repair_info = {
                'file': str(file_path),
                'encoding': encoding,
                'has_corruption': has_corruption,
                'corruption_count': corruption_count,
                'patterns_found': patterns,
                'original_size': len(original_text),
            }

            if not has_corruption:
                return True, "No encoding corruption detected.", repair_info

            # Repair
            repaired_text = EncodingRepair.repair_text(original_text)

            # Write back (in place, same encoding the file was read with)
            with open(file_path, 'w', encoding=encoding) as f:
                f.write(repaired_text)

            repair_info['repaired_size'] = len(repaired_text)

            message = (
                f"✅ File repaired successfully!\n"
                f"Found and fixed {corruption_count} corruption(s) in {len(patterns)} pattern(s).\n"
                f"Patterns:\n" + "\n".join(f"  • {p}" for p in patterns)
            )

            return True, message, repair_info

        except Exception as e:
            return False, f"❌ Error: {str(e)}", {'error': str(e)}

    @staticmethod
    def repair_with_encoding_fallback(file_path: str) -> Tuple[bool, str, Dict]:
        """
        Try to repair a file by attempting different encodings.

        This handles the case where the file itself might be in the wrong
        encoding.  The repaired file is always written back as UTF-8.

        Args:
            file_path: Path to the file to repair

        Returns:
            Tuple of (success, message, repair_info)
        """
        try:
            file_path = Path(file_path)

            # Try to detect encoding
            detected_encoding = 'utf-8'

            if CHARDET_AVAILABLE:
                with open(file_path, 'rb') as f:
                    raw_data = f.read()

                detected = chardet.detect(raw_data)
                # BUGFIX: chardet may report {'encoding': None}; dict.get's
                # default only covers a *missing* key, so guard against None
                # explicitly instead of passing it on to open().
                detected_encoding = detected.get('encoding') or 'utf-8'

            # Try reading with detected encoding first
            try:
                with open(file_path, 'r', encoding=detected_encoding) as f:
                    text = f.read()
            except (UnicodeDecodeError, LookupError):
                # Try common encodings
                for encoding in ['utf-8', 'latin-1', 'windows-1252', 'iso-8859-1']:
                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            text = f.read()
                        detected_encoding = encoding
                        break
                    except (UnicodeDecodeError, LookupError):
                        continue
                else:
                    # All candidates failed: fall back with error handling so
                    # `text` is always bound.
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        text = f.read()

            # Now detect corruption
            has_corruption, corruption_count, patterns = EncodingRepair.detect_corruption(text)

            if not has_corruption:
                return True, "No encoding corruption detected.", {
                    'file': str(file_path),
                    'detected_encoding': detected_encoding,
                    'corruption_count': 0,
                }

            # Repair
            repaired_text = EncodingRepair.repair_text(text)

            # Write back as UTF-8
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(repaired_text)

            message = (
                f"✅ File repaired successfully!\n"
                f"Original encoding: {detected_encoding}\n"
                f"Fixed {corruption_count} corruption(s) in {len(patterns)} pattern(s).\n"
                f"Saved as UTF-8.\n"
                f"Patterns:\n" + "\n".join(f"  • {p}" for p in patterns)
            )

            return True, message, {
                'file': str(file_path),
                'detected_encoding': detected_encoding,
                'corruption_count': corruption_count,
                'patterns_found': patterns,
            }

        except Exception as e:
            return False, f"❌ Error: {str(e)}", {'error': str(e)}

    @staticmethod
    def scan_directory(directory_path: str, file_extensions: List[str] = None) -> Dict:
        """
        Scan a directory (recursively) for files with encoding corruption.

        Args:
            directory_path: Path to directory to scan
            file_extensions: List of file extensions to check (e.g., ['.txt', '.csv'])
                             If None, defaults to ['.txt', '.csv', '.tsv', '.md'].

        Returns:
            Dictionary with scan results
        """
        if file_extensions is None:
            file_extensions = ['.txt', '.csv', '.tsv', '.md']

        dir_path = Path(directory_path)
        results = {
            'directory': str(dir_path),
            'files_scanned': 0,
            'files_with_corruption': [],
            'total_corruptions': 0,
        }

        try:
            for file_path in dir_path.rglob('*'):
                # Skip directories
                if file_path.is_dir():
                    continue

                # Check extension if specified
                if file_extensions and file_path.suffix.lower() not in file_extensions:
                    continue

                results['files_scanned'] += 1

                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()

                    has_corruption, count, patterns = EncodingRepair.detect_corruption(content)

                    if has_corruption:
                        results['files_with_corruption'].append({
                            'file': str(file_path),
                            'corruptions': count,
                            'patterns': patterns,
                        })
                        results['total_corruptions'] += count

                except Exception:
                    pass  # Best-effort scan: skip files that can't be read

        except Exception as e:
            results['error'] = str(e)

        return results
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
# Example usage / testing
if __name__ == '__main__':
    # Demo text containing real mojibake sequences.
    sample = (
        'young concrete\u00e2\u20ac\u201cjong beton\n'
        'aggregate \u00e2\u20ac\u201c chemical impurities\u00e2\u20ac\u201ctoeslagmateriaal\n'
        'Poisson\u00e2\u20ac\u2122s ratio\u00e2\u20ac\u201ccoëfficiënt van Poisson'
    )
    divider = "\n" + "=" * 60 + "\n"

    print("Original text:")
    print(sample)
    print(divider)

    # Report what the detector finds.
    found, total, details = EncodingRepair.detect_corruption(sample)
    print(f"Corruption detected: {found}")
    print(f"Total corruptions: {total}")
    print("Patterns found:")
    for detail in details:
        print(f"  • {detail}")
    print(divider)

    # Show the repaired version.
    print("Repaired text:")
    print(EncodingRepair.repair_text(sample))
|