supervertaler 1.9.153__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic. Click here for more details.
- Supervertaler.py +47886 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1878 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +333 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1172 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.153.dist-info/METADATA +896 -0
- supervertaler-1.9.153.dist-info/RECORD +85 -0
- supervertaler-1.9.153.dist-info/WHEEL +5 -0
- supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.153.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,638 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MQXLIFF Handler Module
|
|
3
|
+
======================
|
|
4
|
+
Handles import/export of memoQ XLIFF (.mqxliff) files with proper formatting preservation.
|
|
5
|
+
|
|
6
|
+
MQXLIFF is an XLIFF 1.2 format with memoQ-specific extensions for CAT tool metadata
|
|
7
|
+
and formatting tags. This module provides robust parsing and generation of MQXLIFF files
|
|
8
|
+
while preserving inline formatting (bold, italic, underline) and complex structures like
|
|
9
|
+
hyperlinks.
|
|
10
|
+
|
|
11
|
+
Key Features:
|
|
12
|
+
- Parse XLIFF trans-units with source and target segments
|
|
13
|
+
- Extract and preserve inline formatting tags (bpt/ept pairs)
|
|
14
|
+
- Handle complex nested structures (hyperlinks with formatting)
|
|
15
|
+
- Generate valid MQXLIFF output with proper tag structure
|
|
16
|
+
- Maintain segment IDs and memoQ metadata
|
|
17
|
+
|
|
18
|
+
Formatting Tag Structure:
|
|
19
|
+
- <bpt id="X" ctype="bold">{}</bpt>...<ept id="X">{}</ept> - Bold text
|
|
20
|
+
- <bpt id="X" ctype="italic">{}</bpt>...<ept id="X">{}</ept> - Italic text
|
|
21
|
+
- <bpt id="X" ctype="underlined">{}</bpt>...<ept id="X">{}</ept> - Underlined text
|
|
22
|
+
- Nested tags for hyperlinks: <bpt><bpt><bpt>text</ept></ept></ept>
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import xml.etree.ElementTree as ET
|
|
26
|
+
from typing import List, Dict, Tuple, Optional
|
|
27
|
+
import re
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class FormattedSegment:
    """Represents a segment with inline formatting information.

    Attributes:
        id: Identifier of the segment (the trans-unit id).
        plain_text: Segment text without any formatting tags.
        formatted_xml: XML string of the segment with formatting tags preserved.
        formatting_tags: One dict per ``<bpt>`` tag found in ``formatted_xml``
            (keys ``'id'``, ``'type'`` and ``'is_bpt'``).
    """

    # A <bpt> element, optionally namespace-prefixed (e.g. <xliff:bpt>), either
    # self-closed or wrapping its code text. Group 1 is the raw attribute text.
    _BPT_PATTERN = re.compile(
        r'<(?:[\w.-]+:)?bpt(?=[\s/>])([^<>]*?)(?:/>|>[^<]*</(?:[\w.-]+:)?bpt\s*>)'
    )
    # One name="value" (or name='value') attribute pair.
    _ATTR_PATTERN = re.compile(r'''([\w:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')''')

    def __init__(self, segment_id: str, plain_text: str, formatted_xml: str):
        """
        Initialize a formatted segment.

        Args:
            segment_id: Unique identifier for the segment (trans-unit id)
            plain_text: Plain text without any formatting tags
            formatted_xml: XML string with formatting tags preserved
        """
        self.id = segment_id
        self.plain_text = plain_text
        self.formatted_xml = formatted_xml
        self.formatting_tags = self._extract_formatting_tags(formatted_xml)

    def _extract_formatting_tags(self, xml_str: str) -> List[Dict]:
        """Extract formatting tag information from an XML string.

        Every ``<bpt>`` element carrying both an ``id`` and a ``ctype``
        attribute is reported. The element may have a namespace prefix and its
        attributes may come in any order, so serialized namespaced trees
        (``<xliff:bpt ...>``) are recognised as well.

        Args:
            xml_str: Serialized XML of the segment (may be empty or None).

        Returns:
            List of dicts ``{'id': ..., 'type': <ctype>, 'is_bpt': True}`` in
            document order.
        """
        tags = []
        for match in self._BPT_PATTERN.finditer(xml_str or ''):
            attrs = {}
            for name, double_quoted, single_quoted in self._ATTR_PATTERN.findall(match.group(1)):
                # findall() yields '' for the quote style that did not match
                attrs[name] = double_quoted or single_quoted
            if 'id' in attrs and 'ctype' in attrs:
                tags.append({
                    'id': attrs['id'],
                    'type': attrs['ctype'],
                    'is_bpt': True
                })
        return tags

    def __repr__(self):
        return f"FormattedSegment(id={self.id}, text='{self.plain_text[:50]}...', tags={len(self.formatting_tags)})"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class MQXLIFFHandler:
|
|
67
|
+
"""Handler for parsing and generating memoQ XLIFF files."""
|
|
68
|
+
|
|
69
|
+
# Namespaces used in MQXLIFF files
|
|
70
|
+
NAMESPACES = {
|
|
71
|
+
'xliff': 'urn:oasis:names:tc:xliff:document:1.2',
|
|
72
|
+
'mq': 'MQXliff'
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
def __init__(self):
|
|
76
|
+
"""Initialize the MQXLIFF handler."""
|
|
77
|
+
self.tree = None
|
|
78
|
+
self.root = None
|
|
79
|
+
self.file_element = None
|
|
80
|
+
self.body_element = None
|
|
81
|
+
self.source_lang = None
|
|
82
|
+
self.target_lang = None
|
|
83
|
+
|
|
84
|
+
def load(self, file_path: str) -> bool:
|
|
85
|
+
"""
|
|
86
|
+
Load and parse an MQXLIFF file.
|
|
87
|
+
|
|
88
|
+
Args:
|
|
89
|
+
file_path: Path to the .mqxliff file
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
True if loaded successfully, False otherwise
|
|
93
|
+
"""
|
|
94
|
+
try:
|
|
95
|
+
# Register namespaces for proper parsing
|
|
96
|
+
for prefix, uri in self.NAMESPACES.items():
|
|
97
|
+
ET.register_namespace(prefix, uri)
|
|
98
|
+
|
|
99
|
+
self.tree = ET.parse(file_path)
|
|
100
|
+
self.root = self.tree.getroot()
|
|
101
|
+
|
|
102
|
+
# Find the file element
|
|
103
|
+
self.file_element = self.root.find('.//xliff:file', self.NAMESPACES)
|
|
104
|
+
if self.file_element is None:
|
|
105
|
+
# Try without namespace
|
|
106
|
+
self.file_element = self.root.find('.//file')
|
|
107
|
+
|
|
108
|
+
if self.file_element is not None:
|
|
109
|
+
self.source_lang = self.file_element.get('source-language', 'unknown')
|
|
110
|
+
self.target_lang = self.file_element.get('target-language', 'unknown')
|
|
111
|
+
|
|
112
|
+
# Find the body element
|
|
113
|
+
self.body_element = self.root.find('.//xliff:body', self.NAMESPACES)
|
|
114
|
+
if self.body_element is None:
|
|
115
|
+
# Try without namespace
|
|
116
|
+
self.body_element = self.root.find('.//body')
|
|
117
|
+
|
|
118
|
+
return True
|
|
119
|
+
except Exception as e:
|
|
120
|
+
print(f"[MQXLIFF] Error loading file: {e}")
|
|
121
|
+
return False
|
|
122
|
+
|
|
123
|
+
def extract_source_segments(self) -> List[FormattedSegment]:
|
|
124
|
+
"""
|
|
125
|
+
Extract all source segments from the MQXLIFF file.
|
|
126
|
+
|
|
127
|
+
Returns:
|
|
128
|
+
List of FormattedSegment objects containing source text and formatting
|
|
129
|
+
"""
|
|
130
|
+
segments = []
|
|
131
|
+
|
|
132
|
+
if self.body_element is None:
|
|
133
|
+
return segments
|
|
134
|
+
|
|
135
|
+
# Find all trans-unit elements (with or without namespace)
|
|
136
|
+
trans_units = self.body_element.findall('.//xliff:trans-unit', self.NAMESPACES)
|
|
137
|
+
if not trans_units:
|
|
138
|
+
trans_units = self.body_element.findall('.//trans-unit')
|
|
139
|
+
|
|
140
|
+
for trans_unit in trans_units:
|
|
141
|
+
trans_unit_id = trans_unit.get('id', 'unknown')
|
|
142
|
+
|
|
143
|
+
# Skip auxiliary segments (like hyperlink URLs with mq:nosplitjoin="true")
|
|
144
|
+
nosplitjoin = trans_unit.get('{MQXliff}nosplitjoin', 'false')
|
|
145
|
+
if nosplitjoin == 'true':
|
|
146
|
+
continue
|
|
147
|
+
|
|
148
|
+
# Find source element
|
|
149
|
+
source_elem = trans_unit.find('xliff:source', self.NAMESPACES)
|
|
150
|
+
if source_elem is None:
|
|
151
|
+
source_elem = trans_unit.find('source')
|
|
152
|
+
|
|
153
|
+
if source_elem is not None:
|
|
154
|
+
# Get the XML string of the source element's content
|
|
155
|
+
formatted_xml = ET.tostring(source_elem, encoding='unicode', method='xml')
|
|
156
|
+
|
|
157
|
+
# Extract plain text (removing all tags)
|
|
158
|
+
plain_text = self._extract_plain_text(source_elem)
|
|
159
|
+
|
|
160
|
+
segment = FormattedSegment(trans_unit_id, plain_text, formatted_xml)
|
|
161
|
+
segments.append(segment)
|
|
162
|
+
|
|
163
|
+
return segments
|
|
164
|
+
|
|
165
|
+
def _extract_plain_text(self, element: ET.Element) -> str:
|
|
166
|
+
"""
|
|
167
|
+
Recursively extract plain text from an XML element, stripping all tags.
|
|
168
|
+
|
|
169
|
+
Args:
|
|
170
|
+
element: The XML element to extract text from
|
|
171
|
+
|
|
172
|
+
Returns:
|
|
173
|
+
Plain text string with all tags removed (including {} placeholders)
|
|
174
|
+
"""
|
|
175
|
+
text_parts = []
|
|
176
|
+
|
|
177
|
+
# Add the element's text
|
|
178
|
+
if element.text:
|
|
179
|
+
text_parts.append(element.text)
|
|
180
|
+
|
|
181
|
+
# Recursively process child elements
|
|
182
|
+
for child in element:
|
|
183
|
+
text_parts.append(self._extract_plain_text(child))
|
|
184
|
+
# Add the tail text (text after the child element's closing tag)
|
|
185
|
+
if child.tail:
|
|
186
|
+
text_parts.append(child.tail)
|
|
187
|
+
|
|
188
|
+
full_text = ''.join(text_parts)
|
|
189
|
+
|
|
190
|
+
# Remove {} placeholders that come from bpt/ept tags
|
|
191
|
+
# These are used in MQXLIFF to mark tag positions
|
|
192
|
+
full_text = full_text.replace('{}', '')
|
|
193
|
+
|
|
194
|
+
return full_text
|
|
195
|
+
|
|
196
|
+
def update_target_segments(self, translations: List[str]) -> int:
|
|
197
|
+
"""
|
|
198
|
+
Update target segments in the MQXLIFF with translations.
|
|
199
|
+
|
|
200
|
+
This method attempts to preserve formatting from the source segment by:
|
|
201
|
+
1. Copying the source formatting structure
|
|
202
|
+
2. Replacing the text content with the translation
|
|
203
|
+
3. Adjusting tag IDs to avoid conflicts
|
|
204
|
+
|
|
205
|
+
Args:
|
|
206
|
+
translations: List of translated strings (plain text)
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
Number of segments updated
|
|
210
|
+
"""
|
|
211
|
+
if self.body_element is None:
|
|
212
|
+
return 0
|
|
213
|
+
|
|
214
|
+
# Find all trans-unit elements
|
|
215
|
+
trans_units = self.body_element.findall('.//xliff:trans-unit', self.NAMESPACES)
|
|
216
|
+
if not trans_units:
|
|
217
|
+
trans_units = self.body_element.findall('.//trans-unit')
|
|
218
|
+
|
|
219
|
+
translation_idx = 0
|
|
220
|
+
segments_updated = 0
|
|
221
|
+
|
|
222
|
+
for trans_unit in trans_units:
|
|
223
|
+
# Skip auxiliary segments
|
|
224
|
+
nosplitjoin = trans_unit.get('{MQXliff}nosplitjoin', 'false')
|
|
225
|
+
if nosplitjoin == 'true':
|
|
226
|
+
continue
|
|
227
|
+
|
|
228
|
+
if translation_idx >= len(translations):
|
|
229
|
+
break
|
|
230
|
+
|
|
231
|
+
translation = translations[translation_idx]
|
|
232
|
+
translation_idx += 1
|
|
233
|
+
|
|
234
|
+
# Find source and target elements
|
|
235
|
+
source_elem = trans_unit.find('xliff:source', self.NAMESPACES)
|
|
236
|
+
if source_elem is None:
|
|
237
|
+
source_elem = trans_unit.find('source')
|
|
238
|
+
|
|
239
|
+
target_elem = trans_unit.find('xliff:target', self.NAMESPACES)
|
|
240
|
+
if target_elem is None:
|
|
241
|
+
target_elem = trans_unit.find('target')
|
|
242
|
+
|
|
243
|
+
if source_elem is not None and target_elem is not None:
|
|
244
|
+
# Copy formatting from source to target
|
|
245
|
+
self._copy_formatting_to_target(source_elem, target_elem, translation)
|
|
246
|
+
segments_updated += 1
|
|
247
|
+
|
|
248
|
+
# Update segment status to Confirmed
|
|
249
|
+
trans_unit.set('{MQXliff}status', 'Confirmed')
|
|
250
|
+
|
|
251
|
+
return segments_updated
|
|
252
|
+
|
|
253
|
+
def _copy_formatting_to_target(self, source_elem: ET.Element, target_elem: ET.Element, translation: str):
|
|
254
|
+
"""
|
|
255
|
+
Copy formatting structure from source to target and insert translation text.
|
|
256
|
+
|
|
257
|
+
Strategy:
|
|
258
|
+
1. If source has no formatting tags, just set plain text
|
|
259
|
+
2. If source has formatting, clone the structure and try to map text
|
|
260
|
+
3. For complex cases, preserve tag structure but use translation text
|
|
261
|
+
|
|
262
|
+
Args:
|
|
263
|
+
source_elem: Source XML element with formatting
|
|
264
|
+
target_elem: Target XML element to populate
|
|
265
|
+
translation: Translated text (plain)
|
|
266
|
+
"""
|
|
267
|
+
# Clear existing target content but preserve attributes
|
|
268
|
+
target_attribs = target_elem.attrib.copy()
|
|
269
|
+
target_elem.clear()
|
|
270
|
+
target_elem.tag = 'target' # Ensure it's a target element
|
|
271
|
+
|
|
272
|
+
# Restore important attributes
|
|
273
|
+
for key in ['{http://www.w3.org/XML/1998/namespace}space', 'mq:segpart']:
|
|
274
|
+
if key in target_attribs:
|
|
275
|
+
target_elem.set(key, target_attribs[key])
|
|
276
|
+
|
|
277
|
+
# Preserve xml:space="preserve" if source has it
|
|
278
|
+
space_attr = source_elem.get('{http://www.w3.org/XML/1998/namespace}space')
|
|
279
|
+
if space_attr:
|
|
280
|
+
target_elem.set('{http://www.w3.org/XML/1998/namespace}space', space_attr)
|
|
281
|
+
|
|
282
|
+
# Check if source has formatting tags (child elements)
|
|
283
|
+
has_formatting = len(list(source_elem)) > 0
|
|
284
|
+
|
|
285
|
+
if not has_formatting:
|
|
286
|
+
# Simple case: no formatting tags, just set text
|
|
287
|
+
target_elem.text = translation
|
|
288
|
+
else:
|
|
289
|
+
# Complex case: has formatting tags
|
|
290
|
+
# Strategy: Clone the structure and replace text content
|
|
291
|
+
self._clone_with_translation(source_elem, target_elem, translation)
|
|
292
|
+
|
|
293
|
+
def _clone_with_translation(self, source_elem: ET.Element, target_elem: ET.Element, translation: str):
|
|
294
|
+
"""
|
|
295
|
+
Clone source element structure to target, replacing text with translation.
|
|
296
|
+
|
|
297
|
+
Strategy: Clone the entire structure, then intelligently place translation text.
|
|
298
|
+
|
|
299
|
+
Args:
|
|
300
|
+
source_elem: Source element to clone from
|
|
301
|
+
target_elem: Target element to populate
|
|
302
|
+
translation: Translation text to insert
|
|
303
|
+
"""
|
|
304
|
+
# Extract source text for comparison
|
|
305
|
+
source_text = self._extract_plain_text(source_elem)
|
|
306
|
+
|
|
307
|
+
# Clone all child elements (formatting tags) to preserve structure
|
|
308
|
+
# Also copy the text that appears before the first child
|
|
309
|
+
target_elem.text = source_elem.text
|
|
310
|
+
|
|
311
|
+
for child in source_elem:
|
|
312
|
+
cloned_child = self._deep_clone_element(child)
|
|
313
|
+
target_elem.append(cloned_child)
|
|
314
|
+
|
|
315
|
+
# Now replace the text content with the translation
|
|
316
|
+
# For complex nested structures, we need to be very careful about where we place text
|
|
317
|
+
# to avoid breaking the XML structure
|
|
318
|
+
|
|
319
|
+
# If source and translation are identical, structure is already correct
|
|
320
|
+
if source_text.strip() == translation.strip():
|
|
321
|
+
return
|
|
322
|
+
|
|
323
|
+
# Try to place the translation intelligently
|
|
324
|
+
self._place_translation_carefully(target_elem, source_text, translation)
|
|
325
|
+
|
|
326
|
+
def _deep_clone_element(self, element: ET.Element) -> ET.Element:
|
|
327
|
+
"""Deep clone an XML element with all its children."""
|
|
328
|
+
cloned = ET.Element(element.tag, element.attrib)
|
|
329
|
+
cloned.text = element.text
|
|
330
|
+
cloned.tail = element.tail
|
|
331
|
+
|
|
332
|
+
for child in element:
|
|
333
|
+
cloned.append(self._deep_clone_element(child))
|
|
334
|
+
|
|
335
|
+
return cloned
|
|
336
|
+
|
|
337
|
+
def _place_translation_carefully(self, element: ET.Element, source_text: str, translation: str):
|
|
338
|
+
"""
|
|
339
|
+
Carefully place translation text in the element structure.
|
|
340
|
+
|
|
341
|
+
This is conservative: it only modifies text nodes that contain actual content words,
|
|
342
|
+
not formatting codes. For complex cases, it may preserve more source text structure
|
|
343
|
+
than ideal, but it won't break the XML.
|
|
344
|
+
|
|
345
|
+
Args:
|
|
346
|
+
element: The target element to modify
|
|
347
|
+
source_text: Original source text
|
|
348
|
+
translation: Translation to place
|
|
349
|
+
"""
|
|
350
|
+
# Strategy: Find text nodes that contain actual words (not just "{}" or encoded tags)
|
|
351
|
+
# and replace them with corresponding parts of the translation
|
|
352
|
+
|
|
353
|
+
# For now, use a simple heuristic:
|
|
354
|
+
# If there's text in element.text, replace it
|
|
355
|
+
# If there's text in a child's tail (after a tag), replace it
|
|
356
|
+
# But DON'T touch text inside <bpt>/<ept> tags (that's formatting metadata)
|
|
357
|
+
|
|
358
|
+
# Collect all "real content" text nodes
|
|
359
|
+
real_content_nodes = []
|
|
360
|
+
|
|
361
|
+
if element.text and len(element.text.strip()) > 0:
|
|
362
|
+
# Check if it's not just whitespace or formatting codes
|
|
363
|
+
if not self._is_formatting_code(element.text):
|
|
364
|
+
real_content_nodes.append(('root_text', element.text))
|
|
365
|
+
|
|
366
|
+
# Check child tails (text after tags)
|
|
367
|
+
for i, child in enumerate(element):
|
|
368
|
+
if child.tail and len(child.tail.strip()) > 0:
|
|
369
|
+
if not self._is_formatting_code(child.tail):
|
|
370
|
+
real_content_nodes.append(('child_tail', i, child.tail))
|
|
371
|
+
|
|
372
|
+
# If we found content nodes, use simple replacement strategy
|
|
373
|
+
if real_content_nodes:
|
|
374
|
+
# Simple approach: Just try string replacement in each node
|
|
375
|
+
# This works for simple cases and won't break complex structures
|
|
376
|
+
for node_info in real_content_nodes:
|
|
377
|
+
if node_info[0] == 'root_text':
|
|
378
|
+
# Try to replace source words with translation words
|
|
379
|
+
if element.text:
|
|
380
|
+
element.text = element.text.replace(source_text.strip(), translation.strip())
|
|
381
|
+
elif node_info[0] == 'child_tail':
|
|
382
|
+
idx = node_info[1]
|
|
383
|
+
if element[idx].tail:
|
|
384
|
+
element[idx].tail = element[idx].tail.replace(source_text.strip(), translation.strip())
|
|
385
|
+
else:
|
|
386
|
+
# No obvious content nodes, check if text is inside nested structure
|
|
387
|
+
# For these complex cases, just place translation where the source text was found
|
|
388
|
+
self._recursive_text_replace(element, source_text, translation)
|
|
389
|
+
|
|
390
|
+
def _recursive_text_replace(self, element: ET.Element, old_text: str, new_text: str):
|
|
391
|
+
"""
|
|
392
|
+
Recursively search for old_text and replace with new_text.
|
|
393
|
+
Only replaces in text nodes, not in tag attributes or structure.
|
|
394
|
+
"""
|
|
395
|
+
if element.text and old_text.strip() in element.text:
|
|
396
|
+
element.text = element.text.replace(old_text.strip(), new_text.strip())
|
|
397
|
+
|
|
398
|
+
for child in element:
|
|
399
|
+
if child.tail and old_text.strip() in child.tail:
|
|
400
|
+
child.tail = child.tail.replace(old_text.strip(), new_text.strip())
|
|
401
|
+
# Recurse into children
|
|
402
|
+
self._recursive_text_replace(child, old_text, new_text)
|
|
403
|
+
|
|
404
|
+
def _replace_all_text_content(self, element: ET.Element, old_text: str, new_text: str):
|
|
405
|
+
"""
|
|
406
|
+
Replace text content in an element tree, handling text split across nodes.
|
|
407
|
+
|
|
408
|
+
The challenge: Source text like "Hello world" might be split as:
|
|
409
|
+
- element.text = "Hello "
|
|
410
|
+
- child[0].text = "world"
|
|
411
|
+
|
|
412
|
+
We need to collect all content text, replace it with the translation,
|
|
413
|
+
then put it back in the structure.
|
|
414
|
+
|
|
415
|
+
Args:
|
|
416
|
+
element: The element to process
|
|
417
|
+
old_text: The original source text (plain, no tags)
|
|
418
|
+
new_text: The translation text to insert
|
|
419
|
+
"""
|
|
420
|
+
# Clean both texts for comparison
|
|
421
|
+
old_clean = old_text.strip()
|
|
422
|
+
new_clean = new_text.strip()
|
|
423
|
+
|
|
424
|
+
# If texts are identical, no replacement needed
|
|
425
|
+
if old_clean == new_clean:
|
|
426
|
+
return
|
|
427
|
+
|
|
428
|
+
# Find all content text nodes (excluding <bpt>/<ept> formatting codes)
|
|
429
|
+
content_nodes = []
|
|
430
|
+
|
|
431
|
+
# Check element.text (text before first child)
|
|
432
|
+
if element.text and not self._is_formatting_code(element.text):
|
|
433
|
+
content_nodes.append(('root', None, element.text))
|
|
434
|
+
|
|
435
|
+
# Check all children
|
|
436
|
+
for i, child in enumerate(element):
|
|
437
|
+
# For <bpt> and <ept> tags, their .text contains formatting codes like "{}" or "<hlnk...>"
|
|
438
|
+
# We should NOT treat this as content
|
|
439
|
+
if child.tag not in ['bpt', 'ept']:
|
|
440
|
+
if child.text and not self._is_formatting_code(child.text):
|
|
441
|
+
content_nodes.append(('child_text', i, child.text))
|
|
442
|
+
|
|
443
|
+
# child.tail is text AFTER the child tag, this is content
|
|
444
|
+
if child.tail and not self._is_formatting_code(child.tail):
|
|
445
|
+
content_nodes.append(('child_tail', i, child.tail))
|
|
446
|
+
|
|
447
|
+
# If no content nodes, nothing to replace
|
|
448
|
+
if not content_nodes:
|
|
449
|
+
return
|
|
450
|
+
|
|
451
|
+
# Strategy: Place entire translation in the first content node, clear others
|
|
452
|
+
first_node = content_nodes[0]
|
|
453
|
+
node_type, node_index, node_text = first_node
|
|
454
|
+
|
|
455
|
+
if node_type == 'root':
|
|
456
|
+
element.text = new_clean
|
|
457
|
+
elif node_type == 'child_text':
|
|
458
|
+
element[node_index].text = new_clean
|
|
459
|
+
elif node_type == 'child_tail':
|
|
460
|
+
element[node_index].tail = new_clean
|
|
461
|
+
|
|
462
|
+
# Clear all other content nodes
|
|
463
|
+
for node in content_nodes[1:]:
|
|
464
|
+
node_type, node_index, node_text = node
|
|
465
|
+
if node_type == 'root':
|
|
466
|
+
element.text = ""
|
|
467
|
+
elif node_type == 'child_text':
|
|
468
|
+
element[node_index].text = ""
|
|
469
|
+
elif node_type == 'child_tail':
|
|
470
|
+
element[node_index].tail = ""
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
def _is_formatting_code(self, text: str) -> bool:
|
|
474
|
+
"""
|
|
475
|
+
Check if text is a formatting code rather than actual content.
|
|
476
|
+
Formatting codes include: "{}", "<...>", whitespace-only
|
|
477
|
+
"""
|
|
478
|
+
if not text:
|
|
479
|
+
return True
|
|
480
|
+
|
|
481
|
+
text_stripped = text.strip()
|
|
482
|
+
if not text_stripped:
|
|
483
|
+
return True # Whitespace only
|
|
484
|
+
|
|
485
|
+
# Check for common formatting placeholders
|
|
486
|
+
if text_stripped == "{}":
|
|
487
|
+
return True
|
|
488
|
+
|
|
489
|
+
# Check for encoded XML tags (formatting metadata)
|
|
490
|
+
if text_stripped.startswith("<") and text_stripped.endswith(">"):
|
|
491
|
+
return True
|
|
492
|
+
|
|
493
|
+
return False
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def save(self, output_path: str) -> bool:
|
|
497
|
+
"""
|
|
498
|
+
Save the modified MQXLIFF file with proper namespace handling.
|
|
499
|
+
|
|
500
|
+
Args:
|
|
501
|
+
output_path: Path where to save the file
|
|
502
|
+
|
|
503
|
+
Returns:
|
|
504
|
+
True if saved successfully, False otherwise
|
|
505
|
+
"""
|
|
506
|
+
try:
|
|
507
|
+
if self.tree is None:
|
|
508
|
+
return False
|
|
509
|
+
|
|
510
|
+
# Register namespaces to avoid namespace prefix issues
|
|
511
|
+
# This ensures the default namespace is used correctly
|
|
512
|
+
ET.register_namespace('', 'urn:oasis:names:tc:xliff:document:1.2')
|
|
513
|
+
ET.register_namespace('mq', 'MQXliff')
|
|
514
|
+
ET.register_namespace('xsi', 'http://www.w3.org/2001/XMLSchema-instance')
|
|
515
|
+
|
|
516
|
+
# Write with XML declaration and UTF-8 encoding
|
|
517
|
+
self.tree.write(output_path, encoding='utf-8', xml_declaration=True, method='xml')
|
|
518
|
+
|
|
519
|
+
# Post-process to fix namespace issues that ElementTree might create
|
|
520
|
+
# Read the file and ensure proper structure
|
|
521
|
+
self._fix_namespace_prefixes(output_path)
|
|
522
|
+
|
|
523
|
+
return True
|
|
524
|
+
except Exception as e:
|
|
525
|
+
print(f"[MQXLIFF] Error saving file: {e}")
|
|
526
|
+
return False
|
|
527
|
+
|
|
528
|
+
def _fix_namespace_prefixes(self, file_path: str):
|
|
529
|
+
"""
|
|
530
|
+
Fix namespace prefix issues in the saved file.
|
|
531
|
+
ElementTree sometimes adds unwanted prefixes. This method ensures
|
|
532
|
+
the file matches the expected MQXLIFF format.
|
|
533
|
+
|
|
534
|
+
Args:
|
|
535
|
+
file_path: Path to the file to fix
|
|
536
|
+
"""
|
|
537
|
+
try:
|
|
538
|
+
# Read the file
|
|
539
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
|
540
|
+
content = f.read()
|
|
541
|
+
|
|
542
|
+
# Fix common ElementTree namespace issues
|
|
543
|
+
# Replace xliff:xliff with xliff (default namespace)
|
|
544
|
+
content = content.replace('<xliff:xliff ', '<xliff ')
|
|
545
|
+
content = content.replace('</xliff:xliff>', '</xliff>')
|
|
546
|
+
content = content.replace('xmlns:xliff="urn:oasis:names:tc:xliff:document:1.2"',
|
|
547
|
+
'xmlns="urn:oasis:names:tc:xliff:document:1.2"')
|
|
548
|
+
|
|
549
|
+
# Remove xliff: prefixes from standard XLIFF elements
|
|
550
|
+
# but keep mq: prefixes for memoQ extensions
|
|
551
|
+
for tag in ['file', 'header', 'tool', 'body', 'trans-unit', 'source', 'target',
|
|
552
|
+
'context-group', 'context', 'bpt', 'ept', 'ph', 'it', 'x']:
|
|
553
|
+
content = content.replace(f'<xliff:{tag} ', f'<{tag} ')
|
|
554
|
+
content = content.replace(f'<xliff:{tag}>', f'<{tag}>')
|
|
555
|
+
content = content.replace(f'</xliff:{tag}>', f'</{tag}>')
|
|
556
|
+
|
|
557
|
+
# Write back the corrected content
|
|
558
|
+
with open(file_path, 'w', encoding='utf-8') as f:
|
|
559
|
+
f.write(content)
|
|
560
|
+
|
|
561
|
+
except Exception as e:
|
|
562
|
+
print(f"[MQXLIFF] Warning: Could not fix namespace prefixes: {e}")
|
|
563
|
+
# Non-fatal - file might still work
|
|
564
|
+
|
|
565
|
+
def get_segment_count(self) -> int:
|
|
566
|
+
"""Get the number of translatable segments (excluding auxiliary segments)."""
|
|
567
|
+
if self.body_element is None:
|
|
568
|
+
return 0
|
|
569
|
+
|
|
570
|
+
trans_units = self.body_element.findall('.//xliff:trans-unit', self.NAMESPACES)
|
|
571
|
+
if not trans_units:
|
|
572
|
+
trans_units = self.body_element.findall('.//trans-unit')
|
|
573
|
+
|
|
574
|
+
count = 0
|
|
575
|
+
for trans_unit in trans_units:
|
|
576
|
+
nosplitjoin = trans_unit.get('{MQXliff}nosplitjoin', 'false')
|
|
577
|
+
if nosplitjoin != 'true':
|
|
578
|
+
count += 1
|
|
579
|
+
|
|
580
|
+
return count
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def test_mqxliff_handler():
    """Manual smoke test: load, inspect, update and re-save an MQXLIFF file.

    Reads the input path from ``sys.argv[1]``, prints the languages, the
    segment count and the first five segments, fills every target with a
    dummy translation and saves the result next to the input as
    ``<name>_test_output<ext>``. The input file itself is never overwritten,
    whatever its extension is.
    """
    import os
    import sys

    if len(sys.argv) < 2:
        print("Usage: python mqxliff_handler.py <path_to_mqxliff_file>")
        return

    file_path = sys.argv[1]

    print(f"Testing MQXLIFF Handler with: {file_path}")
    print("=" * 60)

    handler = MQXLIFFHandler()

    # Load file
    if not handler.load(file_path):
        print("Failed to load file!")
        return

    print("✓ File loaded successfully")
    print(f" Source language: {handler.source_lang}")
    print(f" Target language: {handler.target_lang}")
    print(f" Segment count: {handler.get_segment_count()}")
    print()

    # Extract segments
    segments = handler.extract_source_segments()
    print(f"✓ Extracted {len(segments)} segments")
    print()

    # Display first 5 segments
    print("First 5 segments:")
    for i, seg in enumerate(segments[:5], 1):
        print(f"\n Segment {i} (ID: {seg.id}):")
        print(f" Plain text: {seg.plain_text}")
        if seg.formatting_tags:
            print(f" Formatting: {seg.formatting_tags}")

    # Test update (with dummy translations)
    print("\n" + "=" * 60)
    print("Testing update with dummy translations...")
    dummy_translations = [f"TRANSLATED: {seg.plain_text}" for seg in segments]
    updated_count = handler.update_target_segments(dummy_translations)
    print(f"✓ Updated {updated_count} target segments")

    # Save test output. The suffix goes before the extension; a plain
    # str.replace('.mqxliff', ...) would return the input path unchanged for
    # any other extension and the save would overwrite the input file.
    base, ext = os.path.splitext(file_path)
    output_path = f"{base}_test_output{ext}"
    if handler.save(output_path):
        print(f"✓ Saved test output to: {output_path}")
    else:
        print("✗ Failed to save output")


if __name__ == "__main__":
    test_mqxliff_handler()
|