supervertaler 1.9.153__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic. Click here for more details.

Files changed (85) hide show
  1. Supervertaler.py +47886 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1878 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +333 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1172 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.153.dist-info/METADATA +896 -0
  81. supervertaler-1.9.153.dist-info/RECORD +85 -0
  82. supervertaler-1.9.153.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.153.dist-info/top_level.txt +2 -0
@@ -0,0 +1,516 @@
1
+ """
2
+ Phrase (Memsource) Bilingual DOCX Handler
3
+
4
+ This module handles the import and export of Phrase (formerly Memsource) bilingual DOCX files.
5
+ Phrase uses a multi-table format with numbered inline tags.
6
+
7
+ Format Structure:
8
+ - Multiple tables (typically 2 content tables + 3 metadata tables)
9
+ - Content tables with 7 columns:
10
+ 1. Segment ID (locked, gray D9D9D9)
11
+ 2. Empty (locked, gray D9D9D9)
12
+ 3. Segment number (locked, gray D9D9D9)
13
+ 4. Source text with tags (locked, gray D9D9D9)
14
+ 5. Target text with tags (EDITABLE, no shading)
15
+ 6. Status code (locked, colored: 774306=99/confirmed, 5B37C3=MT, etc.)
16
+ 7. Empty (no shading)
17
+
18
+ Tag System:
19
+ - Simple tags: {N} (e.g., {1}, {2})
20
+ - Formatting tags: {N>text<N} (e.g., {1>CAUTION<1})
21
+ - Empty formatting: {N><N}
22
+ - Closing tag variant: <N}
23
+ - Special content: {N>[NBSP]<N} (non-breaking space, U+00A0), {N>on page N<N} (cross-ref)
24
+
25
+ Critical for re-import:
26
+ - Only Column 5 (target text) should be edited
27
+ - All other columns must remain unchanged
28
+ - Tags must be preserved in the target
29
+ - Cell shading/locking must be maintained
30
+ """
31
+
32
+ import os
33
+ import re
34
+ import zipfile
35
+ import xml.etree.ElementTree as ET
36
+ from docx import Document
37
+ from docx.shared import RGBColor, Pt
38
+ from docx.oxml.ns import qn
39
+ from lxml import etree
40
+ from typing import List, Dict, Tuple, Optional
41
+ from copy import deepcopy
42
+
43
+
44
class PhraseSegment:
    """
    Represents a Phrase segment with tag information.

    Attributes:
        segment_id: Segment identifier as read from the first table column.
        segment_num: Segment number as read from the third table column.
        source_text: Source text with Phrase tags kept inline as plain text.
        target_text: Target text (may be empty).
        status_code: Status code as read from the sixth table column.
        row_index: Index of the row inside its table.
        table_index: Index of the table inside the document.
        source_tags: Tag numbers found in ``source_text``, in order of
            appearance (one entry per tag marker).
    """

    # Captures the number of every tag marker: "{N}", "{N>" and "<N}".
    _TAG_NUMBER = re.compile(r'\{(\d+)[>}]|<(\d+)\}')
    # Matches the markers themselves (not the text between an opening
    # "{N>" and its closing "<N}").
    _TAG_MARKER = re.compile(r'\{\d+[>}]|<\d+\}')

    def __init__(self, segment_id: str, segment_num: str, source_text: str,
                 target_text: str = "", status_code: str = "",
                 row_index: int = 0, table_index: int = 0):
        self.segment_id = segment_id
        self.segment_num = segment_num
        self.source_text = source_text  # Plain text with tags as text
        self.target_text = target_text
        self.status_code = status_code
        self.row_index = row_index
        self.table_index = table_index

        # Extract tags from source for validation
        self.source_tags = self._extract_tags(source_text)

    def _extract_tags(self, text: str) -> List[str]:
        """Extract all Phrase tag numbers from text.

        Every marker contributes one entry, so a pair such as ``{1>x<1}``
        yields ``['1', '1']``.
        """
        # Each match fills exactly one of the two groups; keep that one.
        return [opening or closing
                for opening, closing in self._TAG_NUMBER.findall(text)]

    @property
    def plain_source(self) -> str:
        """Get source text without tags for translation.

        Only the tag markers are removed. Text wrapped by a formatting pair
        is kept (``{1>CAUTION<1}`` -> ``CAUTION``), so translatable content
        is not lost, and nested pairs are handled because opening and
        closing markers are stripped independently.
        """
        return self._TAG_MARKER.sub('', self.source_text).strip()

    def __repr__(self):
        return f"PhraseSegment(id={self.segment_id[:20]}..., num={self.segment_num}, status={self.status_code})"
82
+
83
+
84
class PhraseDOCXHandler:
    """
    Handler for Phrase (Memsource) bilingual DOCX files.

    This class provides methods to:
    - Load and parse Phrase bilingual DOCX files
    - Extract source segments with tag markers
    - Update target segments with translations (preserving exact structure)
    - Save modified files ready for re-import to Phrase

    Typical flow: ``load()`` -> ``extract_source_segments()`` ->
    ``update_target_segments()`` -> ``save()``.
    """

    # Phrase tag patterns
    TAG_SIMPLE = re.compile(r'\{\d+\}')                    # {1}
    TAG_FORMATTED = re.compile(r'\{\d+>.*?<\d+\}')         # {1>text<1}
    TAG_CLOSING = re.compile(r'<\d+\}')                    # <1}
    TAG_EMPTY = re.compile(r'\{\d+><\d+\}')                # {1><1}
    TAG_ALL = re.compile(r'\{\d+(?:>.*?<\d+)?\}|<\d+\}')   # All patterns

    # Content-table heuristics used by load(): a table qualifies when it has
    # MORE than MIN_CONTENT_ROWS rows and at least MIN_CONTENT_COLS columns.
    # NOTE(review): files with 100 or fewer segments per table are rejected
    # with these defaults; override on a subclass or instance if needed.
    MIN_CONTENT_ROWS = 100
    MIN_CONTENT_COLS = 7

    # Inline formatting markup produced/consumed by this handler.
    _FORMAT_TAG = re.compile(r'(</?[biu]>)')
    _FORMAT_KEYS = {'b': 'bold', 'i': 'italic', 'u': 'underline'}
    _XML_SPACE_ATTR = '{http://www.w3.org/XML/1998/namespace}space'

    def __init__(self):
        self.doc = None
        self.content_tables = []  # List of (table_obj, table_index) tuples
        self.segments: "List[PhraseSegment]" = []
        self.file_path = None

    def load(self, file_path: str) -> bool:
        """
        Load a Phrase bilingual DOCX file.

        A table is treated as a content table when it has more than
        ``MIN_CONTENT_ROWS`` rows, at least ``MIN_CONTENT_COLS`` columns in
        its first row, and a ':' in its first cell.

        Args:
            file_path: Path to the Phrase bilingual DOCX file

        Returns:
            bool: True if loaded successfully, False otherwise
        """
        try:
            self.file_path = file_path
            self.doc = Document(file_path)

            tables = self.doc.tables
            if len(tables) == 0:
                print(f"ERROR: No tables found in {file_path}")
                return False

            # Find content tables (tables with many rows and 7-8 columns)
            self.content_tables = []
            for idx, table in enumerate(tables):
                rows = table.rows
                row_count = len(rows)
                if row_count == 0 or row_count <= self.MIN_CONTENT_ROWS:
                    continue

                first_row_cells = rows[0].cells
                if len(first_row_cells) < self.MIN_CONTENT_COLS:
                    continue

                # Check if first cell looks like a Phrase segment ID
                # (segment IDs have format "xxx:nnn")
                if ':' in first_row_cells[0].text.strip():
                    self.content_tables.append((table, idx))
                    print(f"Found content table {idx} with {row_count} rows, {len(first_row_cells)} columns")

            if not self.content_tables:
                print("ERROR: No Phrase content tables found")
                return False

            print(f"Successfully loaded Phrase bilingual DOCX: {file_path}")
            print(f"Content tables: {len(self.content_tables)}")
            print(f"Total segments: {sum(len(t[0].rows) for t in self.content_tables)}")

            return True

        except Exception as e:
            # Top-level boundary: report the failure and signal it via the
            # return value instead of propagating.
            print(f"ERROR loading Phrase DOCX: {e}")
            import traceback
            traceback.print_exc()
            return False

    def extract_source_segments(self) -> "List[PhraseSegment]":
        """
        Extract all source segments from the Phrase bilingual DOCX.

        The result is also stored on ``self.segments`` (replacing any
        previous content). Rows that raise while being read are reported
        and skipped.

        Returns:
            list: List of PhraseSegment objects
        """
        self.segments = []

        if not self.content_tables:
            print("ERROR: No content tables loaded")
            return []

        # Process each content table
        for table_obj, table_idx in self.content_tables:
            for row_idx, row in enumerate(table_obj.rows):
                try:
                    cells = row.cells

                    # Columns 1 and 6 (0-based) are empty and are not read.
                    segment = PhraseSegment(
                        segment_id=cells[0].text.strip(),
                        segment_num=cells[2].text.strip(),
                        # Source/target keep run formatting as <b>/<i>/<u>
                        source_text=self._cell_to_tagged_text(cells[3]),
                        target_text=self._cell_to_tagged_text(cells[4]),
                        status_code=cells[5].text.strip(),
                        row_index=row_idx,
                        table_index=table_idx
                    )
                    self.segments.append(segment)

                except Exception as e:
                    print(f"WARNING: Error processing row {row_idx} in table {table_idx}: {e}")
                    continue

        print(f"Extracted {len(self.segments)} segments from Phrase DOCX")
        return self.segments

    def update_target_segments(self, translations: Dict[str, str]) -> int:
        """
        Update target segments with translations.

        Segment IDs that do not occur in any content table are ignored.

        Args:
            translations: Dict mapping segment_id to translated text (with Phrase tags)

        Returns:
            int: Number of segments updated
        """
        updated_count = 0

        # Build a lookup map: segment_id -> row. If an ID occurs more than
        # once, the last row wins.
        rows_by_id = {}
        for table_obj, _table_idx in self.content_tables:
            for row in table_obj.rows:
                rows_by_id[row.cells[0].text.strip()] = row

        # Update translations
        for segment_id, translation in translations.items():
            row = rows_by_id.get(segment_id)
            if row is None:
                continue

            cells = row.cells
            source_cell = cells[3]  # Column 4 (source)
            target_cell = cells[4]  # Column 5 (target)

            # Clear existing target content
            self._clear_cell(target_cell)

            # Write new translation, parsing its <b>/<i>/<u> markup
            self._set_cell_text_with_source_formatting(target_cell, translation, source_cell)

            updated_count += 1

        print(f"Updated {updated_count} target segments")
        return updated_count

    @classmethod
    def _mark_preserve_space(cls, run):
        """Set xml:space='preserve' on the run's w:t element, if it has one."""
        t_elem = run._r.find(qn('w:t'))
        if t_elem is not None:
            t_elem.set(cls._XML_SPACE_ATTR, 'preserve')

    @staticmethod
    def _remove_runs(para):
        """Detach every run returned by ``para.runs`` from its parent."""
        # Iterate over a copy: the run elements are removed while looping.
        for run in list(para.runs):
            run._r.getparent().remove(run._r)

    def _clear_cell(self, cell):
        """Clear all content from a cell (removes the runs of every paragraph)."""
        for para in cell.paragraphs:
            self._remove_runs(para)

    def _set_cell_text(self, cell, text: str):
        """Set cell text, preserving whitespace.

        Only the first paragraph of the cell is rewritten; a cell without
        paragraphs is left untouched.
        """
        if not cell.paragraphs:
            return

        para = cell.paragraphs[0]

        # Clear existing runs
        self._remove_runs(para)

        # Add new text with xml:space='preserve' for proper whitespace handling
        if text:
            self._mark_preserve_space(para.add_run(text))

    def _cell_to_tagged_text(self, cell) -> str:
        """
        Convert cell with formatting to HTML-tagged text.
        Uses the same format as memoQ handler: <b>, <i>, <u> tags.

        Each non-empty run is emitted separately; tags open in the order
        bold, italic, underline and close in reverse. The output of all
        paragraphs is concatenated without a separator.
        """
        result_parts = []

        for paragraph in cell.paragraphs:
            for run in paragraph.runs:
                text = run.text
                if not text:
                    continue

                opening = ""
                closing = ""
                # Only an explicit True counts; None and False are both
                # treated as "not set".
                for flag, tag in ((run.bold, 'b'), (run.italic, 'i'), (run.underline, 'u')):
                    if flag is True:
                        opening += f"<{tag}>"
                        closing = f"</{tag}>{closing}"

                result_parts.append(f"{opening}{text}{closing}")

        return ''.join(result_parts)

    def _tagged_text_to_runs(self, text: str) -> list:
        """
        Parse text with HTML formatting tags and return a list of runs with formatting info.
        Compatible with Supervertaler's memoQ format.

        Returns:
            list: Dicts with keys 'text', 'bold', 'italic', 'underline'.
            Empty text pieces produce no run. Only <b>, <i>, <u> and their
            closing forms are interpreted; anything else (including Phrase
            tags) stays in the run text.
        """
        runs = []
        # Current formatting state
        state = {'bold': False, 'italic': False, 'underline': False}
        current_text = ""

        # Splitting on a capturing group keeps the tags as list items.
        for part in self._FORMAT_TAG.split(text):
            if not self._FORMAT_TAG.fullmatch(part):
                # Regular text
                current_text += part
                continue

            # A tag ends the pending run, which keeps the formatting that
            # was active before the tag.
            if current_text:
                runs.append({'text': current_text, **state})
                current_text = ""
            state[self._FORMAT_KEYS[part[-2]]] = not part.startswith('</')

        # Don't forget the last run
        if current_text:
            runs.append({'text': current_text, **state})

        return runs

    def _set_cell_text_with_source_formatting(self, target_cell, text: str, source_cell):
        """
        Set cell text parsing HTML formatting tags.
        This preserves word-level bold, italic, and underline formatting.

        Args:
            target_cell: Cell to write into (only its first paragraph is used).
            text: Text that may contain <b>, <i>, <u> markup.
            source_cell: Accepted for interface compatibility; not read here.
        """
        if not target_cell.paragraphs:
            return

        para = target_cell.paragraphs[0]

        # Clear existing runs
        self._remove_runs(para)

        # Parse HTML tags and create runs
        for run_info in self._tagged_text_to_runs(text):
            run_text = run_info.get('text', '')
            if not run_text:
                continue

            run = para.add_run(run_text)
            self._mark_preserve_space(run)

            # Apply formatting; unset flags are left at the run's default
            if run_info.get('bold'):
                run.bold = True
            if run_info.get('italic'):
                run.italic = True
            if run_info.get('underline'):
                run.underline = True

    def save(self, output_path: str = None) -> bool:
        """
        Save the modified document.

        Args:
            output_path: Path to save to (defaults to original path)

        Returns:
            bool: True if saved successfully, False on any error or when no
            document has been loaded
        """
        if self.doc is None:
            print("ERROR saving Phrase DOCX: no document loaded")
            return False

        try:
            save_path = output_path or self.file_path
            self.doc.save(save_path)
            print(f"Saved Phrase bilingual DOCX: {save_path}")
            return True
        except Exception as e:
            print(f"ERROR saving Phrase DOCX: {e}")
            import traceback
            traceback.print_exc()
            return False

    def get_segments_for_translation(self) -> List[Tuple[str, str, str]]:
        """
        Get segments that need translation.

        A segment is included when its target is empty or its status code
        is exactly "MT". Operates on ``self.segments``.

        Returns:
            List of (segment_id, source_text, plain_source) tuples
        """
        return [
            (seg.segment_id, seg.source_text, seg.plain_source)
            for seg in self.segments
            if not seg.target_text or seg.status_code == "MT"
        ]
487
+
488
+
489
def detect_phrase_docx(file_path: str, min_rows: int = 100) -> bool:
    """
    Detect if a DOCX file is a Phrase bilingual file.

    The file is reported as Phrase when it has at least 3 tables and one of
    them has more than ``min_rows`` rows, exactly 7 columns in its first
    row, and a ':' in its first cell (segment ID format).

    Args:
        file_path: Path to the DOCX file to inspect.
        min_rows: A table must have MORE than this many rows to count as a
            content table. The default keeps the original threshold; pass a
            smaller value to recognise short files.

    Returns:
        bool: True if this appears to be a Phrase bilingual DOCX. Any error
        while opening or inspecting the file is reported and yields False.
    """
    try:
        doc = Document(file_path)

        tables = doc.tables
        if len(tables) < 3:
            return False

        for table in tables:
            rows = table.rows
            row_count = len(rows)
            if row_count == 0 or row_count <= min_rows:
                continue

            # NOTE(review): PhraseDOCXHandler.load() accepts >= 7 columns
            # while detection requires exactly 7 - confirm which is intended.
            first_row_cells = rows[0].cells
            if len(first_row_cells) == 7 and ':' in first_row_cells[0].text.strip():
                return True

        return False

    except Exception as e:
        print(f"Error detecting Phrase DOCX: {e}")
        return False