supervertaler 1.9.153__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic. Click here for more details.

Files changed (85) hide show
  1. Supervertaler.py +47886 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1878 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +333 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1172 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.153.dist-info/METADATA +896 -0
  81. supervertaler-1.9.153.dist-info/RECORD +85 -0
  82. supervertaler-1.9.153.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.153.dist-info/top_level.txt +2 -0
@@ -0,0 +1,638 @@
1
+ """
2
+ MQXLIFF Handler Module
3
+ ======================
4
+ Handles import/export of memoQ XLIFF (.mqxliff) files with proper formatting preservation.
5
+
6
+ MQXLIFF is an XLIFF 1.2 format with memoQ-specific extensions for CAT tool metadata
7
+ and formatting tags. This module provides robust parsing and generation of MQXLIFF files
8
+ while preserving inline formatting (bold, italic, underline) and complex structures like
9
+ hyperlinks.
10
+
11
+ Key Features:
12
+ - Parse XLIFF trans-units with source and target segments
13
+ - Extract and preserve inline formatting tags (bpt/ept pairs)
14
+ - Handle complex nested structures (hyperlinks with formatting)
15
+ - Generate valid MQXLIFF output with proper tag structure
16
+ - Maintain segment IDs and memoQ metadata
17
+
18
+ Formatting Tag Structure:
19
+ - <bpt id="X" ctype="bold">{}</bpt>...<ept id="X">{}</ept> - Bold text
20
+ - <bpt id="X" ctype="italic">{}</bpt>...<ept id="X">{}</ept> - Italic text
21
+ - <bpt id="X" ctype="underlined">{}</bpt>...<ept id="X">{}</ept> - Underlined text
22
+ - Nested tags for hyperlinks: <bpt><bpt><bpt>text</ept></ept></ept>
23
+ """
24
+
25
+ import xml.etree.ElementTree as ET
26
+ from typing import List, Dict, Tuple, Optional
27
+ import re
28
+
29
+
30
class FormattedSegment:
    """Represents a segment with inline formatting information.

    Attributes are set once in ``__init__``:
        id: The trans-unit id the segment was read from.
        plain_text: Segment text without formatting tags.
        formatted_xml: Serialized XML of the segment, tags included.
        formatting_tags: One dict per ``<bpt>`` tag found in ``formatted_xml``
            (keys ``'id'``, ``'type'`` and ``'is_bpt'``).
    """

    # Opening <bpt ...> tag (with or without a namespace prefix such as
    # "xliff:") followed by tag-free content and the matching closing tag.
    # ElementTree serializes namespaced files with a prefix, so the prefix
    # must be optional or no tag of a real MQXLIFF file is ever recognised.
    _BPT_PATTERN = re.compile(
        r'<(?:[\w.-]+:)?bpt(\s[^>]*)>[^<]*</(?:[\w.-]+:)?bpt>'
    )
    # Attribute lookups; the look-behind keeps "rid=" or "mq:id=" from being
    # mistaken for the plain "id" attribute.
    _ID_ATTR = re.compile(r'(?<![\w:.-])id="(\d+)"')
    _CTYPE_ATTR = re.compile(r'(?<![\w:.-])ctype="([^"]+)"')

    # Maximum number of characters of plain text shown by __repr__.
    _REPR_PREVIEW_LENGTH = 50

    def __init__(self, segment_id: str, plain_text: str, formatted_xml: str):
        """
        Initialize a formatted segment.

        Args:
            segment_id: Unique identifier for the segment (trans-unit id)
            plain_text: Plain text without any formatting tags
            formatted_xml: XML string with formatting tags preserved
        """
        self.id = segment_id
        self.plain_text = plain_text
        self.formatted_xml = formatted_xml
        self.formatting_tags = self._extract_formatting_tags(formatted_xml)

    def _extract_formatting_tags(self, xml_str: str) -> List[Dict]:
        """
        Extract formatting tag information from an XML string.

        Only ``<bpt>`` elements that carry both a numeric ``id`` and a
        ``ctype`` attribute are reported; the attributes may appear in any
        order and the element name may carry a namespace prefix.

        Args:
            xml_str: Serialized XML of a segment

        Returns:
            List of dicts ``{'id': str, 'type': str, 'is_bpt': True}`` in
            document order (empty when ``xml_str`` is empty or has no tags)
        """
        tags = []
        if not xml_str:
            return tags

        for match in self._BPT_PATTERN.finditer(xml_str):
            attributes = match.group(1)
            # A self-closing <bpt .../> has no content of its own; skip it.
            if attributes.rstrip().endswith('/'):
                continue

            id_match = self._ID_ATTR.search(attributes)
            ctype_match = self._CTYPE_ATTR.search(attributes)
            if id_match is None or ctype_match is None:
                continue

            tags.append({
                'id': id_match.group(1),
                'type': ctype_match.group(1),
                'is_bpt': True
            })
        return tags

    def __repr__(self):
        # Only add the ellipsis when the text was actually cut off.
        preview = self.plain_text[:self._REPR_PREVIEW_LENGTH]
        if len(self.plain_text) > self._REPR_PREVIEW_LENGTH:
            preview += '...'
        return f"FormattedSegment(id={self.id}, text='{preview}', tags={len(self.formatting_tags)})"
64
+
65
+
66
class MQXLIFFHandler:
    """Handler for parsing and generating memoQ XLIFF files.

    Typical flow: ``load()`` a file, read the translatable source segments
    with ``extract_source_segments()``, write translations back with
    ``update_target_segments()`` and finally ``save()``.

    Trans-units flagged ``mq:nosplitjoin="true"`` and trans-units without a
    ``<source>`` child are skipped in the same way by extraction, update and
    counting, so the list given to ``update_target_segments()`` lines up
    index-for-index with the list returned by ``extract_source_segments()``.
    """

    # Namespaces used in MQXLIFF files
    NAMESPACES = {
        'xliff': 'urn:oasis:names:tc:xliff:document:1.2',
        'mq': 'MQXliff'
    }

    # ElementTree exposes namespaced attributes in Clark notation
    # ("{uri}name"), never as "prefix:name".
    _XML_SPACE = '{http://www.w3.org/XML/1998/namespace}space'
    _MQ_NOSPLITJOIN = '{MQXliff}nosplitjoin'
    _MQ_STATUS = '{MQXliff}status'
    _MQ_SEGPART = '{MQXliff}segpart'

    # XLIFF 1.2 inline elements whose text is native code, not translatable
    # content.
    _CODE_TAGS = ('bpt', 'ept', 'ph', 'it')

    # Standard XLIFF elements that must be written without an "xliff:" prefix.
    _UNPREFIXED_TAGS = (
        'file', 'header', 'tool', 'body', 'trans-unit', 'source', 'target',
        'context-group', 'context', 'bpt', 'ept', 'ph', 'it', 'x',
    )

    def __init__(self):
        """Initialize the MQXLIFF handler with no document loaded."""
        self.tree = None
        self.root = None
        self.file_element = None
        self.body_element = None
        self.source_lang = None
        self.target_lang = None

    # ------------------------------------------------------------------
    # Internal lookup helpers
    # ------------------------------------------------------------------

    def _find_xliff(self, parent: ET.Element, local_name: str, prefix: str = '') -> Optional[ET.Element]:
        """Find an XLIFF element below ``parent``, with or without namespace.

        Args:
            parent: Element to search from
            local_name: Element name without namespace (e.g. ``'source'``)
            prefix: Path prefix, ``''`` for direct children or ``'.//'`` for
                any descendant

        Returns:
            The first matching element, or None
        """
        found = parent.find(f'{prefix}xliff:{local_name}', self.NAMESPACES)
        if found is None:
            # Try without namespace
            found = parent.find(f'{prefix}{local_name}')
        return found

    @staticmethod
    def _local_name(tag) -> str:
        """Return an element tag without its ``{namespace}`` part."""
        if not isinstance(tag, str):
            return ''
        return tag.rsplit('}', 1)[-1]

    def _translatable_units(self) -> List[Tuple[ET.Element, ET.Element]]:
        """Collect ``(trans_unit, source_element)`` pairs that are translatable.

        Auxiliary units (``mq:nosplitjoin="true"``, e.g. hyperlink URLs) and
        units without a ``<source>`` child are left out.

        Returns:
            List of pairs in document order (empty when nothing is loaded)
        """
        if self.body_element is None:
            return []

        # Find all trans-unit elements (with or without namespace)
        trans_units = self.body_element.findall('.//xliff:trans-unit', self.NAMESPACES)
        if not trans_units:
            trans_units = self.body_element.findall('.//trans-unit')

        pairs = []
        for trans_unit in trans_units:
            if trans_unit.get(self._MQ_NOSPLITJOIN, 'false') == 'true':
                continue
            source_elem = self._find_xliff(trans_unit, 'source')
            if source_elem is None:
                continue
            pairs.append((trans_unit, source_elem))
        return pairs

    def _create_target(self, trans_unit: ET.Element, source_elem: ET.Element) -> ET.Element:
        """Insert an empty ``<target>`` into a trans-unit that has none.

        The new element uses the same namespace as ``source_elem`` and is
        placed directly after ``<seg-source>`` when present, otherwise
        directly after ``<source>``.

        Args:
            trans_unit: The trans-unit lacking a target
            source_elem: The trans-unit's source element

        Returns:
            The newly inserted target element
        """
        source_tag = source_elem.tag
        namespace = ''
        if source_tag.startswith('{'):
            namespace = source_tag[:source_tag.index('}') + 1]

        anchor = self._find_xliff(trans_unit, 'seg-source')
        if anchor is None:
            anchor = source_elem

        target_elem = ET.Element(f'{namespace}target')
        # Reuse the anchor's trailing whitespace so the layout stays uniform.
        target_elem.tail = anchor.tail
        trans_unit.insert(list(trans_unit).index(anchor) + 1, target_elem)
        return target_elem

    # ------------------------------------------------------------------
    # Loading and reading
    # ------------------------------------------------------------------

    def load(self, file_path: str) -> bool:
        """
        Load and parse an MQXLIFF file.

        Any previously loaded document is discarded first, so a failed load
        never leaves the handler pointing at the old file.

        Args:
            file_path: Path to the .mqxliff file

        Returns:
            True if loaded successfully, False otherwise
        """
        self.tree = None
        self.root = None
        self.file_element = None
        self.body_element = None
        self.source_lang = None
        self.target_lang = None

        try:
            # Register namespaces for proper parsing
            for prefix, uri in self.NAMESPACES.items():
                ET.register_namespace(prefix, uri)

            self.tree = ET.parse(file_path)
            self.root = self.tree.getroot()

            # Find the file element and read the language pair from it
            self.file_element = self._find_xliff(self.root, 'file', './/')
            if self.file_element is not None:
                self.source_lang = self.file_element.get('source-language', 'unknown')
                self.target_lang = self.file_element.get('target-language', 'unknown')

            # Find the body element
            self.body_element = self._find_xliff(self.root, 'body', './/')

            return True
        except Exception as e:
            # Boundary of the handler: report and signal failure via the
            # return value instead of propagating.
            print(f"[MQXLIFF] Error loading file: {e}")
            self.tree = None
            self.root = None
            self.file_element = None
            self.body_element = None
            self.source_lang = None
            self.target_lang = None
            return False

    def extract_source_segments(self) -> "List[FormattedSegment]":
        """
        Extract all source segments from the MQXLIFF file.

        Returns:
            List of FormattedSegment objects containing source text and formatting
        """
        segments = []
        for trans_unit, source_elem in self._translatable_units():
            # Serialized XML of the source element, inline tags included
            formatted_xml = ET.tostring(source_elem, encoding='unicode', method='xml')
            # Plain text (all tags removed)
            plain_text = self._extract_plain_text(source_elem)
            segments.append(
                FormattedSegment(trans_unit.get('id', 'unknown'), plain_text, formatted_xml)
            )
        return segments

    def _extract_plain_text(self, element: ET.Element) -> str:
        """
        Recursively extract plain text from an XML element, stripping all tags.

        Args:
            element: The XML element to extract text from

        Returns:
            Plain text string with all tags removed (including {} placeholders)
        """
        text_parts = []

        if element.text:
            text_parts.append(element.text)

        for child in element:
            text_parts.append(self._extract_plain_text(child))
            # Tail = text after the child element's closing tag
            if child.tail:
                text_parts.append(child.tail)

        # "{}" placeholders come from bpt/ept tags and only mark tag positions
        return ''.join(text_parts).replace('{}', '')

    def get_segment_count(self) -> int:
        """Get the number of translatable segments (excluding auxiliary segments)."""
        return len(self._translatable_units())

    # ------------------------------------------------------------------
    # Writing translations
    # ------------------------------------------------------------------

    def update_target_segments(self, translations: List[str]) -> int:
        """
        Update target segments in the MQXLIFF with translations.

        Translations are matched by position to the translatable trans-units
        (the same units, in the same order, that ``extract_source_segments()``
        returns). Formatting is carried over from the source segment by
        copying its tag structure and inserting the translation text. A
        trans-unit without a ``<target>`` gets one created. Every updated
        unit has its ``mq:status`` set to ``Confirmed``.

        Args:
            translations: List of translated strings (plain text)

        Returns:
            Number of segments updated
        """
        segments_updated = 0

        # zip() stops at the shorter sequence, so surplus units stay untouched
        # and surplus translations are ignored.
        for (trans_unit, source_elem), translation in zip(self._translatable_units(), translations):
            target_elem = self._find_xliff(trans_unit, 'target')
            if target_elem is None:
                target_elem = self._create_target(trans_unit, source_elem)

            # Copy formatting from source to target
            self._copy_formatting_to_target(source_elem, target_elem, translation)

            # Update segment status to Confirmed
            trans_unit.set(self._MQ_STATUS, 'Confirmed')
            segments_updated += 1

        return segments_updated

    def _copy_formatting_to_target(self, source_elem: ET.Element, target_elem: ET.Element, translation: str):
        """
        Copy formatting structure from source to target and insert translation text.

        Strategy:
        1. If source has no formatting tags, just set plain text
        2. If source has formatting, clone the structure and place the text

        Of the target's previous attributes only ``xml:space`` and
        ``mq:segpart`` are kept; the target's tag (including its namespace)
        and its tail are left as they were.

        Args:
            source_elem: Source XML element with formatting
            target_elem: Target XML element to populate
            translation: Translated text (plain)
        """
        previous_attribs = dict(target_elem.attrib)
        previous_tail = target_elem.tail

        # clear() drops children, text, attributes AND tail; the tail belongs
        # to the surrounding layout, so put it back.
        target_elem.clear()
        target_elem.tail = previous_tail

        # Restore important attributes. 'mq:segpart' is looked up under its
        # Clark-notation key (the form ElementTree actually stores); the
        # literal key is kept for elements built by hand.
        for key in (self._XML_SPACE, self._MQ_SEGPART, 'mq:segpart'):
            if key in previous_attribs:
                target_elem.set(key, previous_attribs[key])

        # Preserve xml:space="preserve" if source has it
        space_attr = source_elem.get(self._XML_SPACE)
        if space_attr:
            target_elem.set(self._XML_SPACE, space_attr)

        if len(source_elem) == 0:
            # Simple case: no formatting tags, just set text
            target_elem.text = translation
        else:
            # Complex case: clone the tag structure, then place the text
            self._clone_with_translation(source_elem, target_elem, translation)

    def _clone_with_translation(self, source_elem: ET.Element, target_elem: ET.Element, translation: str):
        """
        Clone source element structure to target, replacing text with translation.

        Args:
            source_elem: Source element to clone from
            target_elem: Target element to populate
            translation: Translation text to insert
        """
        source_text = self._extract_plain_text(source_elem)

        # Copy the text before the first child, then every child (formatting
        # tags) so the structure is preserved.
        target_elem.text = source_elem.text
        for child in source_elem:
            target_elem.append(self._deep_clone_element(child))

        # If source and translation are identical, structure is already correct
        if source_text.strip() == translation.strip():
            return

        self._place_translation_carefully(target_elem, source_text, translation)

    def _deep_clone_element(self, element: ET.Element) -> ET.Element:
        """Deep clone an XML element with all its children."""
        cloned = ET.Element(element.tag, element.attrib)
        cloned.text = element.text
        cloned.tail = element.tail

        for child in element:
            cloned.append(self._deep_clone_element(child))

        return cloned

    def _place_translation_carefully(self, element: ET.Element, source_text: str, translation: str):
        """
        Place translation text in a cloned element structure.

        Steps, stopping at the first that succeeds:
        1. If one top-level text node (the element's own text or a child's
           tail) contains the whole source text, replace it there, keeping
           the surrounding whitespace.
        2. If there are no top-level content nodes, search nested text nodes.
        3. If the source text is split across several nodes, put the whole
           translation in the first content node and empty the others, so
           all tags survive and no source text is left behind.
        4. If there is no content node at all, append the translation to the
           element's own text.

        Args:
            element: The target element to modify
            source_text: Original source text
            translation: Translation to place
        """
        needle = source_text.strip()
        replacement = translation.strip()
        placed = False

        # An empty needle must never reach str.replace(): replacing "" inserts
        # the translation between every character of the node.
        if needle:
            has_content_node = False

            root_text = element.text
            if root_text and root_text.strip() and not self._is_formatting_code(root_text):
                has_content_node = True
                if needle in root_text:
                    element.text = root_text.replace(needle, replacement)
                    placed = True

            # Child tails hold the text that follows each inline tag
            for child in element:
                tail = child.tail
                if tail and tail.strip() and not self._is_formatting_code(tail):
                    has_content_node = True
                    if needle in tail:
                        child.tail = tail.replace(needle, replacement)
                        placed = True

            if not has_content_node:
                # Text may live inside a nested structure
                placed = self._recursive_text_replace(element, source_text, translation)

        if placed:
            return

        # Source text spans several nodes: redistribute instead of silently
        # leaving the source text in the target.
        if self._replace_all_text_content(element, source_text, translation):
            return

        # No content node anywhere (e.g. a tag-only source): keep the tags
        # and add the translation in front of them.
        if replacement:
            element.text = (element.text or '') + replacement

    def _recursive_text_replace(self, element: ET.Element, old_text: str, new_text: str):
        """
        Recursively search for old_text and replace with new_text.
        Only replaces in text nodes, not in tag attributes or structure.

        Args:
            element: Root of the subtree to search
            old_text: Text to look for (compared after stripping)
            new_text: Replacement text (inserted after stripping)

        Returns:
            True if at least one text node was changed, False otherwise
            (always False when ``old_text`` is empty or whitespace)
        """
        needle = old_text.strip()
        if not needle:
            return False
        replacement = new_text.strip()

        replaced = False
        if element.text and needle in element.text:
            element.text = element.text.replace(needle, replacement)
            replaced = True

        for child in element:
            if child.tail and needle in child.tail:
                child.tail = child.tail.replace(needle, replacement)
                replaced = True
            # Recurse into children
            if self._recursive_text_replace(child, old_text, new_text):
                replaced = True

        return replaced

    def _replace_all_text_content(self, element: ET.Element, old_text: str, new_text: str):
        """
        Replace text content in an element tree, handling text split across nodes.

        The challenge: Source text like "Hello world" might be split as:
        - element.text = "Hello "
        - child[0].tail = "world"

        The whole translation goes into the first content node and every
        other content node is emptied. Content nodes are the element's own
        text, the text of direct children that are not inline code elements
        (bpt/ept/ph/it), and the tails of direct children; whitespace-only
        and formatting-code text is never touched.

        Args:
            element: The element to process
            old_text: The original source text (plain, no tags)
            new_text: The translation text to insert

        Returns:
            True if the element now holds the translation (including the
            case where both texts were already identical), False if no
            content node was found
        """
        old_clean = old_text.strip()
        new_clean = new_text.strip()

        # If texts are identical, no replacement needed
        if old_clean == new_clean:
            return True

        # Each entry is (kind, child index or None)
        content_nodes = []

        if element.text and not self._is_formatting_code(element.text):
            content_nodes.append(('root', None))

        for index, child in enumerate(element):
            # Inline code elements carry formatting codes such as "{}" in
            # their text. Compare the local name so namespaced tags match.
            if self._local_name(child.tag) not in self._CODE_TAGS:
                if child.text and not self._is_formatting_code(child.text):
                    content_nodes.append(('child_text', index))

            # child.tail is text AFTER the child tag, this is content
            if child.tail and not self._is_formatting_code(child.tail):
                content_nodes.append(('child_tail', index))

        if not content_nodes:
            return False

        # Translation in the first node, empty string in all the others
        for position, (kind, index) in enumerate(content_nodes):
            value = new_clean if position == 0 else ""
            if kind == 'root':
                element.text = value
            elif kind == 'child_text':
                element[index].text = value
            else:
                element[index].tail = value

        return True

    def _is_formatting_code(self, text: str) -> bool:
        """
        Check if text is a formatting code rather than actual content.

        Formatting codes include: "{}", an encoded tag ("&lt;...&gt;", or
        "<...>" once the parser has unescaped it), whitespace-only text and
        empty/None text.
        """
        if not text:
            return True

        text_stripped = text.strip()
        if not text_stripped:
            return True  # Whitespace only

        # Common formatting placeholder
        if text_stripped == "{}":
            return True

        # Encoded XML tags (formatting metadata). ElementTree hands out text
        # already unescaped, so both spellings have to be recognised.
        if text_stripped.startswith(("&lt;", "<")) and text_stripped.endswith(("&gt;", ">")):
            return True

        return False

    # ------------------------------------------------------------------
    # Saving
    # ------------------------------------------------------------------

    def save(self, output_path: str) -> bool:
        """
        Save the modified MQXLIFF file with proper namespace handling.

        Args:
            output_path: Path where to save the file

        Returns:
            True if saved successfully, False otherwise (including when no
            document is loaded)
        """
        try:
            if self.tree is None:
                return False

            # Register the XLIFF namespace as the default (empty prefix) so
            # standard elements are written without a prefix.
            ET.register_namespace('', 'urn:oasis:names:tc:xliff:document:1.2')
            ET.register_namespace('mq', 'MQXliff')
            ET.register_namespace('xsi', 'http://www.w3.org/2001/XMLSchema-instance')

            # Write with XML declaration and UTF-8 encoding
            self.tree.write(output_path, encoding='utf-8', xml_declaration=True, method='xml')

            # Post-process to fix namespace issues that ElementTree might create
            self._fix_namespace_prefixes(output_path)

            return True
        except Exception as e:
            # Boundary of the handler: report and signal failure via the
            # return value instead of propagating.
            print(f"[MQXLIFF] Error saving file: {e}")
            return False

    def _fix_namespace_prefixes(self, file_path: str):
        """
        Fix namespace prefix issues in the saved file.

        Rewrites the file so that the XLIFF namespace is the default one and
        standard XLIFF elements carry no ``xliff:`` prefix; ``mq:`` prefixes
        are left alone. Failures are reported but not raised.

        Args:
            file_path: Path to the file to fix
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Replace xliff:xliff with xliff (default namespace)
            content = content.replace('<xliff:xliff ', '<xliff ')
            content = content.replace('</xliff:xliff>', '</xliff>')
            content = content.replace('xmlns:xliff="urn:oasis:names:tc:xliff:document:1.2"',
                                      'xmlns="urn:oasis:names:tc:xliff:document:1.2"')

            # Remove xliff: prefixes from standard XLIFF elements
            for tag in self._UNPREFIXED_TAGS:
                content = content.replace(f'<xliff:{tag} ', f'<{tag} ')
                content = content.replace(f'<xliff:{tag}>', f'<{tag}>')
                content = content.replace(f'</xliff:{tag}>', f'</{tag}>')

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)

        except Exception as e:
            # Non-fatal - file might still work
            print(f"[MQXLIFF] Warning: Could not fix namespace prefixes: {e}")
581
+
582
+
583
def test_mqxliff_handler():
    """Manual smoke test for the MQXLIFF handler.

    Reads the input path from ``sys.argv[1]``, loads the file, prints the
    language pair, the segment count and the first five segments, writes
    dummy translations into every target and saves the result next to the
    input as ``<name>_test_output<ext>``. Prints a usage line and returns
    when no path is given.
    """
    import os
    import sys

    if len(sys.argv) < 2:
        print("Usage: python mqxliff_handler.py <path_to_mqxliff_file>")
        return

    file_path = sys.argv[1]

    print(f"Testing MQXLIFF Handler with: {file_path}")
    print("=" * 60)

    handler = MQXLIFFHandler()

    # Load file
    if not handler.load(file_path):
        print("Failed to load file!")
        return

    print("✓ File loaded successfully")
    print(f" Source language: {handler.source_lang}")
    print(f" Target language: {handler.target_lang}")
    print(f" Segment count: {handler.get_segment_count()}")
    print()

    # Extract segments
    segments = handler.extract_source_segments()
    print(f"✓ Extracted {len(segments)} segments")
    print()

    # Display first 5 segments
    print("First 5 segments:")
    for i, seg in enumerate(segments[:5], 1):
        print(f"\n Segment {i} (ID: {seg.id}):")
        print(f" Plain text: {seg.plain_text}")
        if seg.formatting_tags:
            print(f" Formatting: {seg.formatting_tags}")

    # Test update (with dummy translations)
    print("\n" + "=" * 60)
    print("Testing update with dummy translations...")
    dummy_translations = [f"TRANSLATED: {seg.plain_text}" for seg in segments]
    updated_count = handler.update_target_segments(dummy_translations)
    print(f"✓ Updated {updated_count} target segments")

    # Save test output. Split on the real extension: a plain
    # str.replace('.mqxliff', ...) returns the input path unchanged for any
    # other extension (or different letter case), which would overwrite the
    # input file.
    stem, extension = os.path.splitext(file_path)
    output_path = f"{stem}_test_output{extension or '.mqxliff'}"
    if handler.save(output_path):
        print(f"✓ Saved test output to: {output_path}")
    else:
        print("✗ Failed to save output")


# Script entry point: run the smoke test when executed directly.
if __name__ == "__main__":
    test_mqxliff_handler()