supervertaler 1.9.163__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- Supervertaler.py +48473 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1911 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +351 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1176 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.163.dist-info/METADATA +906 -0
- supervertaler-1.9.163.dist-info/RECORD +85 -0
- supervertaler-1.9.163.dist-info/WHEEL +5 -0
- supervertaler-1.9.163.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.163.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.163.dist-info/top_level.txt +2 -0
modules/tmx_generator.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TMX Generator Module
|
|
3
|
+
|
|
4
|
+
Helper class for generating TMX (Translation Memory eXchange) files.
|
|
5
|
+
Supports TMX 1.4 format with proper XML structure.
|
|
6
|
+
|
|
7
|
+
Extracted from main Supervertaler file for better modularity.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import xml.etree.ElementTree as ET
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_simple_lang_code(lang_name_or_code_input):
    """
    Convert a language name or code to a short ISO 639-1 style code.

    Resolution order:
      1. Known English language names, case-insensitive
         ("English" → "en", "Chinese (Simplified)" → "zh-CN").
      2. Two-letter codes with a variant; the base is lowercased, the variant
         uppercased and '_' becomes '-' ("en_us" → "en-US", "nl-NL" → "nl-NL").
         Only the first subtag after the base is kept.
      3. Bare two-letter codes ("NL" → "nl"). A dangling separator is ignored
         ("en-" → "en").
      4. Fallback: the first two characters, lowercased.

    Args:
        lang_name_or_code_input: Language name or code. Empty/None values
            resolve to the default "en".

    Returns:
        The resolved code as a string; "en" when nothing usable was given.
    """
    if not lang_name_or_code_input:
        return "en"  # Default to English

    lang_input = lang_name_or_code_input.strip()
    lang_lower = lang_input.lower()

    # Language name to ISO 639-1 mapping
    lang_map = {
        # Major languages
        "english": "en",
        "dutch": "nl",
        "german": "de",
        "french": "fr",
        "spanish": "es",
        "italian": "it",
        "portuguese": "pt",
        "russian": "ru",
        "chinese": "zh",
        "japanese": "ja",
        "korean": "ko",
        "arabic": "ar",

        # Other languages (alphabetical)
        "afrikaans": "af",
        "albanian": "sq",
        "armenian": "hy",
        "basque": "eu",
        "bengali": "bn",
        "bulgarian": "bg",
        "catalan": "ca",
        "croatian": "hr",
        "czech": "cs",
        "danish": "da",
        "estonian": "et",
        "finnish": "fi",
        "galician": "gl",
        "georgian": "ka",
        "greek": "el",
        "hebrew": "he",
        "hindi": "hi",
        "hungarian": "hu",
        "icelandic": "is",
        "indonesian": "id",
        "irish": "ga",
        "latvian": "lv",
        "lithuanian": "lt",
        "macedonian": "mk",
        "malay": "ms",
        "norwegian": "no",
        "persian": "fa",
        "polish": "pl",
        "romanian": "ro",
        "serbian": "sr",
        "slovak": "sk",
        "slovenian": "sl",
        "swahili": "sw",
        "swedish": "sv",
        "thai": "th",
        "turkish": "tr",
        "ukrainian": "uk",
        "urdu": "ur",
        "vietnamese": "vi",
        "welsh": "cy",

        # Chinese variants
        "chinese (simplified)": "zh-CN",
        "chinese (traditional)": "zh-TW",
    }

    # Full language name?
    if lang_lower in lang_map:
        return lang_map[lang_lower]

    # Code with a variant, e.g. "en-US", "nl_BE", "fr-CA"
    if '-' in lang_input or '_' in lang_input:
        parts = lang_input.replace('_', '-').split('-')
        # Require a non-empty variant: a dangling separator ("en-") must not
        # produce a malformed tag such as "en-"; it falls through to the
        # base-code handling below instead.
        if len(parts[0]) == 2 and parts[1]:
            return f"{parts[0].lower()}-{parts[1].upper()}"

    # Bare two-letter code (also reached for a code with a dangling separator)
    base_code = lang_lower.split('-')[0].split('_')[0]
    if len(base_code) == 2 and base_code.isalpha():
        return base_code

    # Fallback: first two characters.
    # NOTE(review): this is lossy for names missing from lang_map and for
    # three-letter codes (e.g. "fil-PH" becomes "fi") - confirm callers accept it.
    if len(lang_input) >= 2:
        return lang_input[:2].lower()

    return "en"  # Ultimate fallback
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def get_base_lang_code(lang_code: str) -> str:
    """Return the base language code for a code or language name.

    Examples: 'en-US' → 'en', 'nl-BE' → 'nl', 'Dutch' → 'nl'.
    Empty or None input yields the default 'en'.
    """
    if lang_code:
        # Resolve names and normalise codes first, then cut at the first
        # '-' and after that at the first '_' to drop any variant.
        resolved = get_simple_lang_code(lang_code)
        before_hyphen = resolved.partition('-')[0]
        return before_hyphen.partition('_')[0].lower()

    return "en"
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def get_lang_match_variants(lang_code: str) -> list:
    """
    Build the list of strings under which a language may appear in stored data.

    The result always starts with the base ISO code (e.g. 'nl') and, when the
    code is one of the languages listed below, is followed by its English
    name (e.g. 'Dutch'). Regional forms such as 'nl-NL' are not generated
    here. Empty or None input yields ['en', 'English'].

    This helps match database entries that use inconsistent language formats.
    """
    if not lang_code:
        return ['en', 'English']

    # ISO code → English language name
    names_by_code = {
        "en": "English",
        "nl": "Dutch",
        "de": "German",
        "fr": "French",
        "es": "Spanish",
        "it": "Italian",
        "pt": "Portuguese",
        "ru": "Russian",
        "zh": "Chinese",
        "ja": "Japanese",
        "ko": "Korean",
        "ar": "Arabic",
        "pl": "Polish",
        "sv": "Swedish",
        "da": "Danish",
        "no": "Norwegian",
        "fi": "Finnish",
    }

    base = get_base_lang_code(lang_code)
    full_name = names_by_code.get(base)

    # Only include the name when the base code is known
    if full_name is None:
        return [base]
    return [base, full_name]
|
|
176
|
+
|
|
177
|
+
def normalize_lang_variant(lang_code: str) -> str:
    """Normalize a language code to lowercase base / UPPERCASE variant.

    Examples:
    - nl-nl → nl-NL
    - NL-NL → nl-NL
    - nl_BE → nl-BE (underscore becomes hyphen)
    - nl → nl (base code only)

    Falsy input is returned unchanged. Input with more than one separator is
    reduced to its lowercased first part.
    """
    if not lang_code:
        return lang_code

    # Treat '_' and '-' as the same separator
    pieces = lang_code.replace('_', '-').split('-')
    base = pieces[0].lower()

    # Anything other than exactly "base-variant" collapses to the base code
    if len(pieces) != 2:
        return base

    return f"{base}-{pieces[1].upper()}"
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def languages_are_compatible(lang1: str, lang2: str) -> bool:
    """Return True when both inputs resolve to the same base language code."""
    first_base = get_base_lang_code(lang1)
    second_base = get_base_lang_code(lang2)
    return first_base == second_base
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class TMXGenerator:
    """Helper class for generating TMX (Translation Memory eXchange) files.

    Builds an ElementTree with a TMX 1.4 header and one <tu> per usable
    source/target pair, and writes it to disk with indentation.
    """

    def __init__(self, log_callback=None):
        """Store the logging callback.

        Args:
            log_callback: Optional callable taking one message string. When
                omitted, messages are discarded.
        """
        self.log = log_callback if log_callback else lambda msg: None

    @staticmethod
    def _creation_timestamp(moment=None):
        """Format *moment* (default: the current time) as a UTC 'YYYYMMDDThhmmssZ' stamp."""
        # Local import: the module header only imports ``datetime``.
        from datetime import timezone

        if moment is None:
            moment = datetime.now(timezone.utc)
        elif moment.tzinfo is None:
            # A naive value is taken to be UTC already
            moment = moment.replace(tzinfo=timezone.utc)
        # The trailing 'Z' denotes UTC, so convert before formatting
        return moment.astimezone(timezone.utc).strftime('%Y%m%dT%H%M%SZ')

    def generate_tmx(self, source_segments, target_segments, source_lang, target_lang):
        """Generate TMX content from parallel segments.

        Args:
            source_segments: Iterable of source texts.
            target_segments: Iterable of target texts, paired by position
                with source_segments (pairing stops at the shorter one).
            source_lang: Source language name or code.
            target_lang: Target language name or code.

        Returns:
            An ``xml.etree.ElementTree.ElementTree`` rooted at <tmx>.

        Pairs are skipped when the source or target is empty or
        whitespace-only, or when the target contains '[ERR' or '[Missing'.
        """
        # Resolve the language codes once instead of once per segment
        src_code = get_simple_lang_code(source_lang)
        tgt_code = get_simple_lang_code(target_lang)

        # Basic TMX structure
        tmx = ET.Element('tmx')
        tmx.set('version', '1.4')

        header = ET.SubElement(tmx, 'header')
        header.set('creationdate', self._creation_timestamp())
        header.set('srclang', src_code)
        header.set('adminlang', 'en')
        header.set('segtype', 'sentence')
        header.set('creationtool', 'Supervertaler')
        header.set('creationtoolversion', '3.6.0-beta')
        header.set('datatype', 'plaintext')

        body = ET.SubElement(tmx, 'body')

        # Add translation units
        added_count = 0
        for src, tgt in zip(source_segments, target_segments):
            # Tolerate None sources instead of failing on .strip()
            src_text = '' if src is None else str(src).strip()
            tgt_text = str(tgt).strip() if tgt else ''

            if not src_text or not tgt_text:
                continue
            # Skip targets carrying error / missing-translation markers
            if '[ERR' in tgt_text or '[Missing' in tgt_text:
                continue

            tu = ET.SubElement(body, 'tu')

            # Source segment
            tuv_src = ET.SubElement(tu, 'tuv')
            tuv_src.set('xml:lang', src_code)
            seg_src = ET.SubElement(tuv_src, 'seg')
            seg_src.text = src_text

            # Target segment
            tuv_tgt = ET.SubElement(tu, 'tuv')
            tuv_tgt.set('xml:lang', tgt_code)
            seg_tgt = ET.SubElement(tuv_tgt, 'seg')
            seg_tgt.text = tgt_text

            added_count += 1

        self.log(f"[TMX Generator] Created TMX with {added_count} translation units")
        return ET.ElementTree(tmx)

    def save_tmx(self, tmx_tree, output_path):
        """Save TMX tree to file with indentation.

        The tree is indented in place before writing.

        Args:
            tmx_tree: ElementTree to write (e.g. the result of generate_tmx).
            output_path: Destination file path.

        Returns:
            True on success; False if anything failed (the error is logged,
            not raised).
        """
        try:
            # Pretty print with indentation
            self._indent(tmx_tree.getroot())
            tmx_tree.write(output_path, encoding='utf-8', xml_declaration=True)
            self.log(f"[TMX Generator] Saved TMX file: {output_path}")
            return True
        except Exception as e:
            # Best-effort boundary: report through the log and signal failure
            self.log(f"[TMX Generator] Error saving TMX: {e}")
            return False

    def _indent(self, elem, level=0):
        """Add indentation whitespace to elem and its descendants, in place.

        Only empty or whitespace-only text/tail values are overwritten, so
        real text content is left untouched.
        """
        i = "\n" + level * " "
        if len(elem):
            if not elem.text or not elem.text.strip():
                elem.text = i + " "
            if not elem.tail or not elem.tail.strip():
                elem.tail = i
            for child in elem:
                self._indent(child, level + 1)
            # After the loop ``child`` is the last child: its tail must drop
            # back to this element's level so the closing tag lines up.
            if not child.tail or not child.tail.strip():
                child.tail = i
        else:
            if level and (not elem.tail or not elem.tail.strip()):
                elem.tail = i
|