supervertaler 1.9.153__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic. Click here for more details.

Files changed (85) hide show
  1. Supervertaler.py +47886 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1878 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +333 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1172 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.153.dist-info/METADATA +896 -0
  81. supervertaler-1.9.153.dist-info/RECORD +85 -0
  82. supervertaler-1.9.153.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.153.dist-info/top_level.txt +2 -0
@@ -0,0 +1,430 @@
1
+ """
2
+ Trados Bilingual DOCX Handler (Review Files)
3
+
4
+ This module handles the import and export of Trados Studio bilingual review DOCX files.
5
+ Trados uses a table-based format with numbered inline tags.
6
+
7
+ Format Structure:
8
+ - Table with columns: Segment ID | Segment status | Source segment | Target segment
9
+ - Tags use character style "Tag" and format: <N>text</N>
10
+ - Segment IDs are GUIDs with numeric prefixes
11
+ - Statuses: "Not Translated", "Draft", "Translated", etc.
12
+
13
+ Critical for re-import:
14
+ - Tags MUST preserve the "Tag" character style
15
+ - Tag numbers must match between source and target
16
+ - Segment IDs must remain unchanged
17
+ """
18
+
19
+ import os
20
+ import re
21
+ from docx import Document
22
+ from docx.shared import RGBColor, Pt
23
+ from docx.oxml.ns import qn
24
+ from lxml import etree
25
+ from typing import List, Dict, Tuple, Optional
26
+ from copy import deepcopy
27
+
28
+
29
class TradosSegment:
    """
    One data row of a Trados bilingual review table.

    Holds the segment ID, status, source/target text and the per-run
    information collected from the source cell.  Inline tags appear in the
    text as ``<N>`` / ``</N>`` markers.
    """

    def __init__(self, segment_id: str, status: str, source_text: str, target_text: str = "",
                 source_runs: List[Dict] = None, row_index: int = 0):
        """
        Args:
            segment_id: Content of the "Segment ID" column.
            status: Content of the "Segment status" column.
            source_text: Source text, with tags kept as literal text.
            target_text: Target text (empty string when not given).
            source_runs: List of {text, is_tag, style_xml} dicts; a falsy
                value is replaced by a new empty list.
            row_index: Index of the row inside the table.
        """
        self.segment_id = segment_id
        self.status = status
        self.source_text = source_text  # tags are still present as text
        self.target_text = target_text
        self.source_runs = source_runs if source_runs else []
        self.row_index = row_index

        # Tag numbers found in the source, kept for later validation
        self.source_tags = self._extract_tags(source_text)

    def _extract_tags(self, text: str) -> List[str]:
        """
        Return the number of every tag in *text*, in order of appearance.

        Only the digits are returned, so ``<3>`` and ``</3>`` both
        contribute the string ``"3"``.
        """
        return [found.group(1) for found in re.finditer(r'</?(\d+)>', text)]

    @property
    def plain_source(self) -> str:
        """Source text with every ``<N>`` / ``</N>`` marker removed."""
        tag_regex = re.compile(r'</?(\d+)>')
        return tag_regex.sub('', self.source_text)

    def __repr__(self):
        # Truncate the long fields so the representation stays short
        id_head = self.segment_id[:20]
        source_head = self.source_text[:40]
        return f"TradosSegment(id={id_head}..., status={self.status}, source={source_head}...)"
57
+
58
+
59
class TradosDOCXHandler:
    """
    Handler for Trados Studio bilingual review DOCX files.

    This class provides methods to:
    - Load and parse Trados bilingual review DOCX files
    - Extract source segments with tag markers
    - Update target segments with translations (preserving tag style)
    - Save modified files ready for re-import to Trados

    Column layout assumed throughout (see the header check in ``load``):
    0 = Segment ID, 1 = Segment status, 2 = Source segment, 3 = Target segment.
    """

    # Trados tag pattern: <N> or </N> where N is a number.
    # Group 1 is the whole tag, group 2 is the number.
    TAG_PATTERN = re.compile(r'(</?(\d+)>)')

    def __init__(self):
        self.doc = None            # python-docx Document, set by load()
        self.table = None          # First table of the document, set by load()
        self.segments: List["TradosSegment"] = []
        self.file_path = None      # Path passed to the last load() call
        self.header_row = None     # Stripped texts of the table's first row
        # Deep copy of the first Tag-styled w:rPr found by _capture_tag_style().
        # It is only stored here; no method of this class reads it back.
        self.tag_style_xml = None

    def load(self, file_path: str) -> bool:
        """
        Load a Trados bilingual review DOCX file.

        Segments extracted from a previously loaded file are discarded.

        Args:
            file_path: Path to the Trados bilingual DOCX file

        Returns:
            bool: True if loaded successfully, False otherwise (no table,
            fewer than two rows, unrecognisable header, or any exception
            raised while opening/parsing the file).
        """
        try:
            self.file_path = file_path
            # Segments of an earlier document must not leak into this one
            self.segments = []
            self.doc = Document(file_path)

            # Trados bilingual files should have exactly one table
            if not self.doc.tables:
                print(f"ERROR: No table found in {file_path}")
                return False

            self.table = self.doc.tables[0]

            # Need the header row plus at least one data row
            if len(self.table.rows) < 2:
                print("ERROR: Table has insufficient rows")
                return False

            self.header_row = [cell.text.strip() for cell in self.table.rows[0].cells]

            # Check if this looks like a Trados bilingual DOCX
            expected_headers = ['Segment ID', 'Segment status', 'Source segment', 'Target segment']
            if self.header_row != expected_headers:
                print("WARNING: Headers don't match expected Trados format")
                print(f" Expected: {expected_headers}")
                print(f" Found: {self.header_row}")
                # Continue anyway if it's close enough; an empty header row
                # can never be close enough.
                if not self.header_row or 'Segment' not in self.header_row[0]:
                    return False

            # Find and store the Tag style XML for later use
            self._capture_tag_style()

            print(f"Successfully loaded Trados bilingual DOCX: {file_path}")
            print(f"Header: {self.header_row}")
            print(f"Total rows (including header): {len(self.table.rows)}")

            return True

        except Exception as e:
            # Boundary of the loader: report and signal failure to the caller
            print(f"ERROR loading Trados DOCX: {e}")
            import traceback
            traceback.print_exc()
            return False

    @staticmethod
    def _tag_rpr(run):
        """Return the run's ``w:rPr`` element if its ``w:rStyle`` is "Tag", else None."""
        rPr = run._r.find(qn('w:rPr'))
        if rPr is None:
            return None
        style_elem = rPr.find(qn('w:rStyle'))
        if style_elem is not None and style_elem.get(qn('w:val')) == 'Tag':
            return rPr
        return None

    def _capture_tag_style(self):
        """
        Find and capture the Tag style XML from the document.

        Scans the source column (index 2) of every data row and stores a deep
        copy of the first Tag-styled ``w:rPr`` in ``self.tag_style_xml``.
        The attribute is reset to None first, so a document without tags does
        not inherit the style of a previously loaded one.  Errors are reported
        as a warning and otherwise ignored (best effort).
        """
        self.tag_style_xml = None
        try:
            # Look through the document for a run with Tag style
            for row in self.table.rows[1:]:
                source_cell = row.cells[2]
                for para in source_cell.paragraphs:
                    for run in para.runs:
                        tag_rpr = self._tag_rpr(run)
                        if tag_rpr is not None:
                            # Found a Tag style - save the entire rPr as template
                            self.tag_style_xml = deepcopy(tag_rpr)
                            print("Captured Tag style from document")
                            return
        except Exception as e:
            print(f"Warning: Could not capture Tag style: {e}")

    def extract_source_segments(self) -> List["TradosSegment"]:
        """
        Extract all source segments from the Trados bilingual DOCX.

        Rows that cannot be processed (for example because they have fewer
        than three cells) are reported with a warning and skipped.

        Returns:
            list: List of TradosSegment objects (also stored in
            ``self.segments``); empty when no table is loaded.
        """
        self.segments = []

        if self.table is None:
            print("ERROR: No table loaded")
            return []

        # Skip header row (index 0), process data rows
        for i, row in enumerate(self.table.rows[1:], start=1):
            try:
                cells = row.cells

                # Extract data from columns
                segment_id = cells[0].text.strip()
                status = cells[1].text.strip()
                source_cell = cells[2]
                target_cell = cells[3] if len(cells) > 3 else None

                source_text = source_cell.text.strip()
                target_text = target_cell.text.strip() if target_cell is not None else ""

                # Extract run information for preserving tag styles
                source_runs = self._extract_runs_with_styles(source_cell)

                self.segments.append(TradosSegment(
                    segment_id=segment_id,
                    status=status,
                    source_text=source_text,
                    target_text=target_text,
                    source_runs=source_runs,
                    row_index=i,
                ))

            except Exception as e:
                # One malformed row must not abort the whole extraction
                print(f"WARNING: Error processing row {i}: {e}")
                continue

        print(f"Extracted {len(self.segments)} segments from Trados DOCX")
        return self.segments

    def _extract_runs_with_styles(self, cell) -> List[Dict]:
        """
        Extract runs from a cell, noting which are tags.

        Returns:
            List of dicts with: {text, is_tag, style_xml}.  ``style_xml`` is
            a deep copy of the run's ``w:rPr`` for Tag-styled runs and None
            for all other runs.
        """
        runs = []
        for para in cell.paragraphs:
            for run in para.runs:
                tag_rpr = self._tag_rpr(run)
                runs.append({
                    'text': run.text,
                    'is_tag': tag_rpr is not None,
                    'style_xml': deepcopy(tag_rpr) if tag_rpr is not None else None,
                })

        return runs

    def update_target_segments(self, translations: Dict[int, str]) -> int:
        """
        Update target segments with translations.

        For every valid row index the existing target runs are removed, the
        translation is written with Tag-styled tag runs, and a status of
        "Not Translated" is changed to "Translated".  Indices outside the
        data rows (index 0 is the header) and rows without a target column
        are skipped.

        Args:
            translations: Dict mapping row index to translated text

        Returns:
            int: Number of segments updated (0 when no table is loaded)
        """
        if self.table is None:
            print("ERROR: No table loaded")
            return 0

        # Fetch the row collection once instead of once per translation
        rows = self.table.rows
        row_count = len(rows)
        updated_count = 0

        for idx, translation in translations.items():
            if not 0 < idx < row_count:
                continue

            # Resolve the row's cells a single time and reuse them below
            cells = rows[idx].cells
            if len(cells) < 4:
                print(f"WARNING: Row {idx} has no target column; skipped")
                continue

            status_cell, source_cell, target_cell = cells[1], cells[2], cells[3]

            # Clear existing target content
            for para in target_cell.paragraphs:
                for run in list(para.runs):
                    run._r.getparent().remove(run._r)

            # Write target with proper tag styling
            self._write_text_with_tags(target_cell, translation, source_cell)

            # Update status to indicate translation
            if status_cell.text.strip() == "Not Translated":
                self._set_cell_text(status_cell, "Translated")

            updated_count += 1

        print(f"Updated {updated_count} target segments")
        return updated_count

    def _add_plain_run(self, para, text: str):
        """Append *text* to *para* as a non-tag run with whitespace preserved."""
        run = para.add_run(text)
        self._apply_default_style(run)
        self._set_xml_space_preserve(run)

    def _write_text_with_tags(self, target_cell, text: str, source_cell):
        """
        Write text to target cell, applying Tag style to tag patterns.

        This ensures tags in the target have the same style as in the source,
        which is critical for re-import into Trados.  All runs are appended to
        the first paragraph of the cell; nothing is written when the cell has
        no paragraph.

        Args:
            target_cell: Cell receiving the text.
            text: Translation, with tags written as ``<N>`` / ``</N>``.
            source_cell: Source cell of the same row (currently unused).
        """
        paragraphs = target_cell.paragraphs
        if not paragraphs:
            return

        para = paragraphs[0]

        # finditer gives tag positions directly, which avoids the complexity
        # of split() with capturing groups.
        last_end = 0
        for match in self.TAG_PATTERN.finditer(text):
            # Add any text before this tag
            if match.start() > last_end:
                self._add_plain_run(para, text[last_end:match.start()])

            # Add the tag itself with Tag style, e.g. "<11>" or "</11>"
            tag_run = para.add_run(match.group(0))
            self._apply_tag_style(tag_run)
            self._set_xml_space_preserve(tag_run)

            last_end = match.end()

        # Add any remaining text after the last tag
        if last_end < len(text):
            self._add_plain_run(para, text[last_end:])

    def _set_xml_space_preserve(self, run):
        """
        Set xml:space='preserve' on the run's text elements.

        Every ``w:t`` child is marked, not only the first one, so whitespace
        is kept in each text element of the run.
        """
        for t_elem in run._r.findall(qn('w:t')):
            t_elem.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')

    def _apply_tag_style(self, run):
        """
        Apply the Tag character style to a run.

        Ensures the run has a ``w:rPr`` as its first child and that this
        ``w:rPr`` has a ``w:rStyle`` as its first child with value "Tag".
        """
        # Create rPr element if needed and move it to the front of the run
        rPr = run._r.find(qn('w:rPr'))
        if rPr is None:
            rPr = etree.SubElement(run._r, qn('w:rPr'))
            run._r.insert(0, rPr)

        # Add rStyle element with Tag value; keep it first so it precedes any
        # other run properties that may already be present
        style_elem = rPr.find(qn('w:rStyle'))
        if style_elem is None:
            style_elem = etree.SubElement(rPr, qn('w:rStyle'))
            rPr.insert(0, style_elem)
        style_elem.set(qn('w:val'), 'Tag')

    def _apply_default_style(self, run):
        """Apply default style (language settings) to a run.

        Note: For target text, we DON'T set language at run level.
        The paragraph has its own default language (en-US for target),
        and runs will inherit from that. Setting the source language
        on target runs would confuse Trados.
        """
        # We intentionally don't set language here anymore
        # Target runs should inherit from paragraph-level language setting
        pass

    def _set_cell_text(self, cell, text: str):
        """
        Replace the runs of the cell's first paragraph with a single run.

        The paragraph element itself is kept.  Nothing happens when the cell
        has no paragraph.
        """
        paragraphs = cell.paragraphs
        if not paragraphs:
            return

        para = paragraphs[0]
        # Clear existing runs
        for run in list(para.runs):
            run._r.getparent().remove(run._r)
        para.add_run(text)

    def save(self, output_path: Optional[str] = None) -> bool:
        """
        Save the modified document.

        Args:
            output_path: Path to save to (defaults to original path)

        Returns:
            bool: True if saved successfully, False when no document is
            loaded or saving raised an exception
        """
        if self.doc is None:
            print("ERROR saving Trados DOCX: no document loaded")
            return False

        try:
            save_path = output_path or self.file_path
            self.doc.save(save_path)
            print(f"Saved Trados bilingual DOCX: {save_path}")
            return True
        except Exception as e:
            print(f"ERROR saving Trados DOCX: {e}")
            import traceback
            traceback.print_exc()
            return False

    def get_segments_for_translation(self) -> List[Tuple[int, str, str]]:
        """
        Get segments that need translation.

        A segment qualifies when its status is "Not Translated" or its target
        text is empty.

        Returns:
            List of (row_index, source_text, plain_source) tuples
        """
        return [
            (seg.row_index, seg.source_text, seg.plain_source)
            for seg in self.segments
            if seg.status == "Not Translated" or not seg.target_text
        ]
386
+
387
+
388
def detect_bilingual_docx_type(file_path: str) -> str:
    """
    Guess which CAT tool produced a bilingual DOCX file.

    The decision is based on the header row of the first table and, for
    Phrase, on the shape of any table in the document.  Any exception raised
    while opening or inspecting the file is reported and mapped to
    "unknown".

    Returns:
        str: "trados", "cafetran", "phrase", or "unknown".  memoQ detection
        is not implemented, so "memoq" is never returned.
    """
    try:
        all_tables = Document(file_path).tables
        if not all_tables:
            return "unknown"

        first_rows = all_tables[0].rows
        if len(first_rows) == 0:
            return "unknown"

        headers = [cell.text.strip() for cell in first_rows[0].cells]
        leading_header = headers[0] if headers else None

        # Trados: Segment ID | Segment status | Source segment | Target segment
        if leading_header == "Segment ID" and "Segment status" in headers:
            return "trados"

        # CafeTran: ID | filename | filename | Notes | *
        if leading_header == "ID":
            return "cafetran"

        # Phrase (Memsource): large content tables (more than 100 rows, at
        # least 7 columns) whose first cell holds a segment ID like "xxx:nnn"
        for candidate in all_tables:
            candidate_rows = candidate.rows
            if len(candidate_rows) <= 100:
                continue
            lead_cells = candidate_rows[0].cells
            if len(lead_cells) < 7:
                continue
            if ':' in lead_cells[0].text.strip():
                return "phrase"

        # memoQ: Usually has different structure
        # TODO: Add memoQ detection

        return "unknown"

    except Exception as exc:
        print(f"Error detecting bilingual DOCX type: {exc}")
        return "unknown"