supervertaler 1.9.153__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic. Click here for more details.

Files changed (85)
  1. Supervertaler.py +47886 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1878 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +333 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1172 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.153.dist-info/METADATA +896 -0
  81. supervertaler-1.9.153.dist-info/RECORD +85 -0
  82. supervertaler-1.9.153.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.153.dist-info/top_level.txt +2 -0
@@ -0,0 +1,779 @@
1
+ """
2
+ Déjà Vu X3 Bilingual RTF Handler
3
+
4
+ This module handles the import and export of Déjà Vu X3 bilingual RTF files.
5
+ Déjà Vu exports bilingual tables in RTF format with a 4-column structure.
6
+
7
+ Format Structure:
8
+ - RTF file with embedded table
9
+ - 4 columns per row:
10
+ 1. Segment ID (7-digit format like 0000049)
11
+ 2. Source text with inline tags
12
+ 3. Target text (empty on export, filled on re-import)
13
+ 4. Comments (usually empty)
14
+ - Rows separated by \\row RTF control word
15
+ - Cells separated by \\cell RTF control word
16
+
17
+ Tag System:
18
+ - Inline tags: {NNNNN} format (e.g., {00108}, {00109})
19
+ - Tags appear in pairs (opening and closing)
20
+ - Tags wrap text: {00108}Vind jouw CS{00109}
21
+ - In RTF, tags are escaped: \\{00108\\}text\\{00109\\}
22
+
23
+ Critical for re-import:
24
+ - RTF structure must be preserved exactly
25
+ - Tags must be retained in translations
26
+ - Segment IDs must not be modified
27
+ """
28
+
29
+ import re
30
+ from pathlib import Path
31
+ from typing import List, Dict, Tuple, Optional
32
+ from dataclasses import dataclass
33
+
34
+
35
# RTF hex escapes (\'xx) mapped to the character they encode in Windows-1252.
# Typographic quotes, dashes, the ellipsis and the non-breaking space are
# written as \u escapes so that the table cannot be silently corrupted by a
# tool that normalises "smart" punctuation (a normalised double quote would
# otherwise turn the value into the start of a triple-quoted string).
RTF_ESCAPE_MAP = {
    # Lower-case accented vowels
    r"\'e9": "é",  # e-acute
    r"\'e8": "è",  # e-grave
    r"\'ea": "ê",  # e-circumflex
    r"\'eb": "ë",  # e-diaeresis
    r"\'e0": "à",  # a-grave
    r"\'e1": "á",  # a-acute
    r"\'e2": "â",  # a-circumflex
    r"\'e4": "ä",  # a-diaeresis
    r"\'e3": "ã",  # a-tilde
    r"\'f2": "ò",  # o-grave
    r"\'f3": "ó",  # o-acute
    r"\'f4": "ô",  # o-circumflex
    r"\'f6": "ö",  # o-diaeresis
    r"\'f5": "õ",  # o-tilde
    r"\'fa": "ú",  # u-acute
    r"\'f9": "ù",  # u-grave
    r"\'fb": "û",  # u-circumflex
    r"\'fc": "ü",  # u-diaeresis
    r"\'ec": "ì",  # i-grave
    r"\'ed": "í",  # i-acute
    r"\'ee": "î",  # i-circumflex
    r"\'ef": "ï",  # i-diaeresis
    # Other lower-case letters
    r"\'f1": "ñ",  # n-tilde
    r"\'e7": "ç",  # c-cedilla
    r"\'df": "ß",  # German sharp s
    # Upper-case letters
    r"\'c9": "É",  # E-acute
    r"\'c8": "È",  # E-grave
    r"\'c0": "À",  # A-grave
    r"\'c1": "Á",  # A-acute
    r"\'d3": "Ó",  # O-acute
    r"\'da": "Ú",  # U-acute
    r"\'d1": "Ñ",  # N-tilde
    # Punctuation and symbols
    r"\'ab": "«",  # left guillemet
    r"\'bb": "»",  # right guillemet
    r"\'b0": "°",  # degree
    r"\'96": "\u2013",  # en-dash
    r"\'97": "\u2014",  # em-dash
    r"\'92": "\u2019",  # right single quote
    r"\'93": "\u201c",  # left double quote
    r"\'94": "\u201d",  # right double quote
    r"\'85": "\u2026",  # ellipsis
    r"\'a0": "\u00a0",  # non-breaking space
}
80
+
81
# Inline Déjà Vu tags look like {NNNNN}: exactly five digits inside braces.
# The single capture group yields the digits without the braces.
DEJAVU_TAG_PATTERN = re.compile(r"\{(\d{5})\}")
83
+
84
# Windows LCID values, as used by the RTF \lang / \langnp control words,
# mapped to the display names used by this module.
RTF_LANG_CODES = {
    # --- Western European ---
    1033: "English",
    2057: "English (UK)",
    3081: "English (AU)",
    4105: "English (CA)",
    1043: "Dutch",
    2067: "Dutch (BE)",
    1031: "German",
    2055: "German (CH)",
    3079: "German (AT)",
    1036: "French",
    2060: "French (BE)",
    3084: "French (CA)",
    4108: "French (CH)",
    3082: "Spanish",
    1034: "Spanish (Traditional)",
    2058: "Spanish (MX)",
    1040: "Italian",
    2064: "Italian (CH)",
    1046: "Portuguese (BR)",
    2070: "Portuguese (PT)",
    # --- Nordic ---
    1030: "Danish",
    1035: "Finnish",
    1044: "Norwegian",
    2068: "Norwegian (Nynorsk)",
    1053: "Swedish",
    1039: "Icelandic",
    # --- Eastern European ---
    1045: "Polish",
    1029: "Czech",
    1051: "Slovak",
    1038: "Hungarian",
    1048: "Romanian",
    1026: "Bulgarian",
    1050: "Croatian",
    2074: "Serbian (Latin)",
    3098: "Serbian (Cyrillic)",
    1060: "Slovenian",
    1058: "Ukrainian",
    1049: "Russian",
    1059: "Belarusian",
    1063: "Lithuanian",
    1062: "Latvian",
    1061: "Estonian",
    # --- Asian ---
    2052: "Chinese (Simplified)",
    1028: "Chinese (Traditional)",
    1041: "Japanese",
    1042: "Korean",
    1054: "Thai",
    1066: "Vietnamese",
    1057: "Indonesian",
    1086: "Malay",
    # --- Middle Eastern ---
    1037: "Hebrew",
    1025: "Arabic",
    2049: "Arabic (Iraq)",
    1065: "Persian",
    1055: "Turkish",
    1032: "Greek",
    # --- Other ---
    1027: "Catalan",
    1069: "Basque",
    1110: "Galician",
    1024: "Neutral",  # system default; ignored by language detection
}
153
+
154
+
155
+
156
@dataclass
class DejaVuSegment:
    """
    One row of a Déjà Vu bilingual table: ID, source, target and comment.

    The source text keeps its inline {NNNNN} tags; ``tags`` and
    ``plain_source`` give the tag numbers and the tag-free text.
    """
    segment_id: str          # 7-digit ID like "0000049"
    source_text: str         # source text, Déjà Vu tags included
    target_text: str = ""    # translation; empty when the cell is empty
    comment: str = ""        # content of the comment column
    row_index: int = 0       # row index used when writing back

    @property
    def tags(self) -> List[str]:
        """Tag numbers found in the source text, in order of appearance."""
        return DEJAVU_TAG_PATTERN.findall(self.source_text)

    @property
    def plain_source(self) -> str:
        """Source text with every Déjà Vu tag removed and the ends trimmed."""
        without_tags = DEJAVU_TAG_PATTERN.sub('', self.source_text)
        return without_tags.strip()

    def __repr__(self):
        # Show at most 50 characters of source; mark truncation with "...".
        text = self.source_text
        if len(text) > 50:
            preview = text[:50] + "..."
        else:
            preview = text
        return f"DejaVuSegment(id={self.segment_id}, source='{preview}')"
180
+
181
+
182
class DejaVuRTFHandler:
    """
    Handler for Déjà Vu X3 bilingual RTF files.

    This class provides methods to:
    - Load and parse Déjà Vu bilingual RTF files
    - Extract source segments with tag markers
    - Update target segments with translations
    - Save modified files ready for re-import to Déjà Vu

    Parsing and writing share one small RTF tokenizer (``_tokenize_rtf``)
    and one cell locator (``_find_cell_regions``) so that what is read from
    a cell and what is written into it always agree.
    """

    # Segment ID as it appears in the RTF stream: "...insrsid9000367 0000172}"
    _ID_PATTERN = re.compile(r'(?:insrsid\d+\s+)(\d{7})\}')
    # End-of-cell marker plus trailing whitespace.  The lookahead stops the
    # pattern from matching the prefix of \cellx (a column-boundary
    # definition), which is not a cell end.
    _CELL_PATTERN = re.compile(r'\\cell(?![a-zA-Z])\s*')
    # Control word: letters, optional signed numeric parameter, and at most
    # one space acting as delimiter.
    _CONTROL_WORD = re.compile(r'\\([a-zA-Z]+)(-?\d+)? ?')
    # Hex escape \'xx (one byte in the document code page).
    _HEX_ESCAPE = re.compile(r"\\'([0-9a-fA-F]{2})")
    # A run of ordinary characters (no backslash, brace or line break).
    _PLAIN_RUN = re.compile(r'[^\\{}\r\n]+')
    # Tokens recognised by _decode_rtf_text, tried left to right so that an
    # escaped backslash is consumed before it can look like a control word.
    _DECODE_TOKEN = re.compile(
        r"\\'([0-9a-fA-F]{2})"   # 1: hex escape
        r"|\\u(-?\d+)\?"         # 2: Unicode escape with '?' fallback
        r"|\\([{}\\])"           # 3: escaped brace or backslash
        r"|\\[a-z]+-?\d*\s?"     # control word (dropped)
    )
    # Control words and symbols that stand for a character of text.
    _CONTROL_WORD_TEXT = {'par': '\n', 'line': '\n', 'tab': '\t'}
    _CONTROL_SYMBOL_TEXT = {'~': '\u00a0', '_': '\u2011'}

    def __init__(self):
        self.raw_rtf: str = ""  # RTF content exactly as read by load()
        self.segments: List[DejaVuSegment] = []
        self.file_path: Optional[str] = None
        # Defaults used until _detect_languages() finds \lang codes.
        self.source_lang: str = "Dutch"
        self.target_lang: str = "Spanish"
        # Kept for compatibility; no method of this class fills or reads it.
        self._cell_positions: List[Tuple[int, ...]] = []

    def load(self, file_path: str) -> bool:
        """
        Load a Déjà Vu bilingual RTF file.

        Args:
            file_path: Path to the Déjà Vu bilingual RTF file

        Returns:
            bool: True if loaded successfully, False otherwise
        """
        try:
            self.file_path = file_path

            # newline='' disables newline translation so the line endings
            # in raw_rtf are the ones stored in the file.
            with open(file_path, 'r', encoding='utf-8', errors='replace',
                      newline='') as f:
                self.raw_rtf = f.read()

            self._detect_languages()
            self._parse_segments()

            print(f"Successfully loaded Deja Vu RTF: {file_path}")
            print(f"Languages: {self.source_lang} -> {self.target_lang}")
            print(f"Total segments: {len(self.segments)}")

            return True

        except Exception as e:
            # Boundary method: report the failure and signal it via the
            # return value instead of propagating.
            print(f"ERROR loading Deja Vu RTF: {e}")
            import traceback
            traceback.print_exc()
            return False

    def _detect_languages(self):
        """
        Detect source and target languages from the \\lang codes in the RTF.

        The two most frequent codes (ignoring 1024, "Neutral") are used.
        The more frequent one is taken as the target language, on the
        heuristic that empty target cells still carry language formatting.
        With a single code only the source language is set.  When no code
        is found the current values are left untouched.
        """
        from collections import Counter

        # Matches \langNNNN and \langnpNNNN
        found = re.findall(r'\\lang(?:np)?(\d+)', self.raw_rtf)
        if not found:
            return

        code_counts = Counter(int(code) for code in found)
        # most_common() is ordered by descending frequency.
        main_codes = [code for code, _ in code_counts.most_common() if code != 1024]

        if len(main_codes) >= 2:
            code1, code2 = main_codes[0], main_codes[1]
            lang1 = RTF_LANG_CODES.get(code1, f"Unknown ({code1})")
            lang2 = RTF_LANG_CODES.get(code2, f"Unknown ({code2})")

            if code_counts[code1] > code_counts[code2]:
                # code1 is more frequent, likely target
                self.source_lang, self.target_lang = lang2, lang1
            else:
                self.source_lang, self.target_lang = lang1, lang2

        elif len(main_codes) == 1:
            code = main_codes[0]
            self.source_lang = RTF_LANG_CODES.get(code, f"Unknown ({code})")

    @staticmethod
    def _decode_hex_escape(hex_digits: str) -> str:
        """Decode the two hex digits of a \\'xx escape as a Windows-1252 byte.

        Bytes that Windows-1252 leaves undefined become U+FFFD.
        """
        return bytes([int(hex_digits, 16)]).decode('cp1252', errors='replace')

    @staticmethod
    def _unicode_from_rtf(value: int) -> str:
        """Convert the parameter of \\uN to a character.

        RTF stores the value as a signed 16-bit number, so negative values
        are shifted up by 65536.  Values outside the Unicode range become
        U+FFFD.
        """
        if value < 0:
            value += 65536
        if 0 <= value <= 0x10FFFF:
            return chr(value)
        return '\ufffd'

    @staticmethod
    def _merge_surrogates(text: str) -> str:
        """Combine UTF-16 surrogate pairs produced by consecutive \\uN escapes.

        Unpaired surrogates are replaced with U+FFFD.
        """
        if not any('\ud800' <= ch <= '\udfff' for ch in text):
            return text
        packed = text.encode('utf-16-le', 'surrogatepass')
        return packed.decode('utf-16-le', 'replace')

    def _decode_rtf_text(self, text: str) -> str:
        """
        Decode RTF escape sequences to plain text.

        Handles \\'xx hex escapes, \\uN? Unicode escapes and the escaped
        characters \\{ \\} \\\\ ; other lower-case control words are removed.
        Everything is done in a single left-to-right pass, so text that
        follows an escaped backslash is never mistaken for a control word.
        Runs of spaces are collapsed and the result is stripped.
        """
        def replace_token(match):
            hex_digits, unicode_value, literal = match.groups()
            if hex_digits is not None:
                return self._decode_hex_escape(hex_digits)
            if unicode_value is not None:
                return self._unicode_from_rtf(int(unicode_value))
            if literal is not None:
                return literal
            return ''  # plain control word: drop it

        result = self._DECODE_TOKEN.sub(replace_token, text)
        result = self._merge_surrogates(result)
        result = re.sub(r' +', ' ', result)
        return result.strip()

    def _tokenize_rtf(self, region: str) -> List[Tuple[str, Optional[str]]]:
        """
        Split an RTF fragment into ``(raw, text)`` pairs.

        ``raw`` is the exact source slice; concatenating all ``raw`` values
        gives back ``region``.  ``text`` is ``None`` for markup (braces,
        line breaks, control words without textual meaning) and a string
        for content: the decoded characters, or ``''`` for a fallback
        character that follows a \\uN escape and must be ignored.
        """
        tokens: List[Tuple[str, Optional[str]]] = []
        fallback_len = 1   # characters to ignore after \uN (set by \ucN)
        to_skip = 0        # fallback characters still to be ignored
        pos = 0
        end = len(region)

        while pos < end:
            char = region[pos]

            if char in '{}\r\n':
                if char in '{}':
                    to_skip = 0
                tokens.append((char, None))
                pos += 1
                continue

            if char != '\\':
                run = self._PLAIN_RUN.match(region, pos).group(0)
                pos += len(run)
                if to_skip:
                    dropped, run = run[:to_skip], run[to_skip:]
                    to_skip -= len(dropped)
                    tokens.append((dropped, ''))
                if run:
                    tokens.append((run, run))
                continue

            hex_match = self._HEX_ESCAPE.match(region, pos)
            if hex_match:
                pos = hex_match.end()
                if to_skip:
                    to_skip -= 1
                    tokens.append((hex_match.group(0), ''))
                else:
                    decoded = self._decode_hex_escape(hex_match.group(1))
                    tokens.append((hex_match.group(0), decoded))
                continue

            word_match = self._CONTROL_WORD.match(region, pos)
            if word_match:
                raw, name, param = word_match.group(0, 1, 2)
                pos = word_match.end()
                to_skip = 0
                if name == 'u' and param is not None:
                    tokens.append((raw, self._unicode_from_rtf(int(param))))
                    to_skip = fallback_len
                elif name == 'uc' and param is not None:
                    fallback_len = max(int(param), 0)
                    tokens.append((raw, None))
                elif name in self._CONTROL_WORD_TEXT:
                    tokens.append((raw, self._CONTROL_WORD_TEXT[name]))
                else:
                    tokens.append((raw, None))
                continue

            # Control symbol: backslash plus one non-letter character
            # (or a lone backslash at the very end of the fragment).
            raw = region[pos:pos + 2]
            symbol = raw[1:]
            pos += len(raw)
            if symbol and symbol in '{}\\':
                if to_skip:
                    to_skip -= 1
                    tokens.append((raw, ''))
                else:
                    tokens.append((raw, symbol))
            elif symbol in self._CONTROL_SYMBOL_TEXT:
                tokens.append((raw, self._CONTROL_SYMBOL_TEXT[symbol]))
            else:
                tokens.append((raw, None))

        return tokens

    def _find_cell_regions(self, rtf: str, start_pos: int, count: int) -> List[Tuple[int, int]]:
        """
        Locate the content of the cells that follow a segment ID.

        The first \\cell after ``start_pos`` closes the ID cell.  Each of
        the next ``count`` \\cell markers closes one further cell; the
        returned ``(start, end)`` pairs delimit the text between the
        previous marker (including its trailing whitespace) and the next
        marker.  Fewer than ``count`` pairs are returned when the markers
        run out.
        """
        id_cell_end = self._CELL_PATTERN.search(rtf, start_pos)
        if not id_cell_end:
            return []

        regions = []
        cursor = id_cell_end.end()
        for _ in range(count):
            marker = self._CELL_PATTERN.search(rtf, cursor)
            if not marker:
                break
            regions.append((cursor, marker.start()))
            cursor = marker.end()
        return regions

    def _parse_segments(self):
        """
        Parse ``raw_rtf`` and rebuild ``self.segments``.

        Each row is recognised by its 7-digit segment ID (the form
        ``insrsidNNN 0000172}``).  The three cells after the ID cell are
        read as source, target and comment.  Rows without source text, or
        without three further cells, are skipped.  ``row_index`` is the
        position of the segment in the resulting list.
        """
        self.segments = []
        rtf = self.raw_rtf

        for id_match in self._ID_PATTERN.finditer(rtf):
            regions = self._find_cell_regions(rtf, id_match.end(), 3)
            if len(regions) < 3:
                continue

            source_text, target_text, comment_text = (
                self._extract_text_from_rtf_region(rtf[start:end])
                for start, end in regions
            )
            if not source_text:  # Only add if we have source text
                continue

            self.segments.append(DejaVuSegment(
                segment_id=id_match.group(1),
                source_text=source_text,
                target_text=target_text,
                comment=comment_text,
                row_index=len(self.segments),
            ))

    def _extract_text_from_rtf_region(self, region: str) -> str:
        """
        Extract plain text from an RTF region.

        Text is collected at every group depth.  Escaped braces and
        backslashes, \\'xx hex escapes (Windows-1252) and \\uN Unicode
        escapes are decoded; \\par, \\line and \\tab count as whitespace.
        All whitespace runs are collapsed to one space and the result is
        stripped.
        """
        text = ''.join(piece for _, piece in self._tokenize_rtf(region) if piece)
        text = self._merge_surrogates(text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_source_segments(self) -> List["DejaVuSegment"]:
        """
        Extract all source segments from the Déjà Vu bilingual RTF.

        Returns:
            list: Shallow copy of the list of DejaVuSegment objects
        """
        return self.segments.copy()

    def get_source_texts(self) -> List[str]:
        """Get list of source texts for translation."""
        return [seg.source_text for seg in self.segments]

    def get_target_texts(self) -> List[str]:
        """Get list of target texts (may be empty)."""
        return [seg.target_text for seg in self.segments]

    def update_translations(self, translations: Dict[str, str]) -> int:
        """
        Update target segments with translations.

        Args:
            translations: Dict mapping segment_id to translated text

        Returns:
            int: Number of segments updated
        """
        updated_count = 0

        for segment in self.segments:
            if segment.segment_id in translations:
                segment.target_text = translations[segment.segment_id]
                updated_count += 1

        print(f"Updated {updated_count} target segments")
        return updated_count

    def update_translations_by_index(self, translations: Dict[int, str]) -> int:
        """
        Update target segments with translations by row index.

        Args:
            translations: Dict mapping row_index to translated text

        Returns:
            int: Number of segments updated
        """
        updated_count = 0

        for segment in self.segments:
            if segment.row_index in translations:
                segment.target_text = translations[segment.row_index]
                updated_count += 1

        print(f"Updated {updated_count} target segments by index")
        return updated_count

    def save(self, output_path: str) -> bool:
        """
        Save the RTF file with updated translations.

        The translations of all segments with a non-empty target are
        written into the target cells of ``raw_rtf``; everything else is
        written back unchanged, including line endings.

        Args:
            output_path: Path for the output RTF file

        Returns:
            bool: True if saved successfully, False otherwise
        """
        try:
            translation_map = {
                seg.segment_id: seg.target_text
                for seg in self.segments
                if seg.target_text
            }

            if not translation_map:
                print("WARNING: No translations to save")
                # Still save the file as-is
                with open(output_path, 'w', encoding='utf-8', newline='') as f:
                    f.write(self.raw_rtf)
                return True

            modified_rtf = self._insert_translations(translation_map)

            # newline='' writes line endings exactly as held in memory.
            with open(output_path, 'w', encoding='utf-8', newline='') as f:
                f.write(modified_rtf)

            print(f"Saved Déjà Vu RTF to: {output_path}")
            return True

        except Exception as e:
            print(f"ERROR saving Déjà Vu RTF: {e}")
            import traceback
            traceback.print_exc()
            return False

    def _encode_text_for_rtf(self, text: str) -> str:
        """
        Encode text for RTF format.

        Braces and backslashes are escaped, a line feed becomes ``\\par ``
        and carriage returns are dropped.  Every non-ASCII character is
        written as ``\\uN?`` per UTF-16 code unit, with N as a signed
        16-bit value as RTF requires; characters outside the Basic
        Multilingual Plane therefore produce two escapes.
        """
        pieces = []

        for char in text:
            if char > '\x7f':
                # surrogatepass lets an unpaired surrogate through unchanged
                units = char.encode('utf-16-le', 'surrogatepass')
                for offset in range(0, len(units), 2):
                    unit = int.from_bytes(units[offset:offset + 2], 'little')
                    if unit > 32767:
                        unit -= 65536
                    pieces.append(f'\\u{unit}?')
            elif char == '{':
                pieces.append('\\{')
            elif char == '}':
                pieces.append('\\}')
            elif char == '\\':
                pieces.append('\\\\')
            elif char == '\n':
                pieces.append('\\par ')
            elif char == '\r':
                pass  # Skip carriage returns
            else:
                pieces.append(char)

        return ''.join(pieces)

    def _insert_translations(self, translations: Dict[str, str]) -> str:
        """
        Insert translations into the RTF content.

        For each segment ID found in ``raw_rtf`` that has a non-empty
        translation, the target cell (the second cell after the ID cell)
        is updated:

        - empty cell: a formatted run with the translation is inserted at
          the start of the cell;
        - cell already containing exactly this translation: left as is,
          so saving twice does not duplicate text;
        - cell containing other text: the run is inserted and the old
          text is removed, while the cell's control words and braces are
          kept.

        Returns:
            The modified RTF string; ``raw_rtf`` itself is not changed.
        """
        rtf = self.raw_rtf

        # Falls back to 3082 (Spanish) for an unknown target language name.
        target_lang_code = self._get_rtf_lang_code(self.target_lang) or 3082

        edits = []  # (start, end, new_text) spans to replace in rtf

        for id_match in self._ID_PATTERN.finditer(rtf):
            translation = translations.get(id_match.group(1))
            if not translation:
                continue

            # regions[0] is the source cell, regions[1] the target cell.
            regions = self._find_cell_regions(rtf, id_match.end(), 2)
            if len(regions) < 2:
                continue
            target_start, target_end = regions[1]

            tokens = self._tokenize_rtf(rtf[target_start:target_end])
            existing_text = self._extract_text_from_rtf_region(rtf[target_start:target_end])
            wanted_text = re.sub(r'\s+', ' ', translation).strip()
            if existing_text == wanted_text:
                continue  # already present: nothing to write

            encoded_translation = self._encode_text_for_rtf(translation)

            # Build simple RTF-formatted text with properly balanced braces
            replacement = (
                f'{{\\rtlch\\fcs1 \\af37 \\ltrch\\fcs0 '
                f'\\f37\\lang{target_lang_code}\\langfe{target_lang_code}'
                f'\\langnp{target_lang_code} {encoded_translation}}}'
            )

            if existing_text:
                # Keep the cell's markup, drop only its old text.
                kept_markup = ''.join(raw for raw, text in tokens if text is None)
                edits.append((target_start, target_end, replacement + kept_markup))
            else:
                edits.append((target_start, target_start, replacement))

        # Assemble the result front to back in one pass.
        edits.sort(key=lambda edit: edit[0])
        pieces = []
        cursor = 0
        for start, end, new_text in edits:
            if start < cursor:
                continue  # overlaps a span already rewritten: leave it alone
            pieces.append(rtf[cursor:start])
            pieces.append(new_text)
            cursor = end
        pieces.append(rtf[cursor:])

        return ''.join(pieces)

    def _get_rtf_lang_code(self, lang_name: str) -> Optional[int]:
        """Get RTF language code from language name (case-insensitive)."""
        wanted = lang_name.lower()
        for code, name in RTF_LANG_CODES.items():
            if name.lower() == wanted:
                return code
        return None

    def get_segment_by_id(self, segment_id: str) -> Optional["DejaVuSegment"]:
        """Get the first segment with the given ID, or None."""
        return next(
            (seg for seg in self.segments if seg.segment_id == segment_id),
            None,
        )

    def get_segment_count(self) -> int:
        """Get the number of segments."""
        return len(self.segments)

    def has_translations(self) -> bool:
        """Check if any segments have translations."""
        return any(seg.target_text for seg in self.segments)
707
+
708
+
709
def extract_dejavu_tags(text: str) -> List[str]:
    """
    Collect the numbers of all Déjà Vu tags in a text.

    Args:
        text: Text that may contain {NNNNN} tags

    Returns:
        The 5-digit tag numbers as strings, in order of appearance
    """
    tag_numbers = [match.group(1) for match in DEJAVU_TAG_PATTERN.finditer(text)]
    return tag_numbers
720
+
721
+
722
def strip_dejavu_tags(text: str) -> str:
    """
    Delete every Déjà Vu tag from a text.

    Args:
        text: Text that may contain {NNNNN} tags

    Returns:
        The text without tags, with leading and trailing whitespace trimmed
    """
    untagged = DEJAVU_TAG_PATTERN.sub('', text)
    return untagged.strip()
733
+
734
+
735
def validate_dejavu_tags(source: str, target: str) -> Tuple[bool, List[str]]:
    """
    Check that target and source contain the same Déjà Vu tags.

    Tags are compared as multisets: a tag that occurs twice in the source
    must occur twice in the target.  The order of tags is not checked.

    Args:
        source: Source text with tags
        target: Target text that should contain the same tags

    Returns:
        Tuple of (is_valid, issues).  ``issues`` lists one
        ``"Missing: {NNNNN}"`` entry per tag the target lacks, followed by
        one ``"Extra: {NNNNN}"`` entry per tag the target has in excess;
        each group is sorted by tag number so the result is reproducible.
    """
    from collections import Counter

    source_counts = Counter(DEJAVU_TAG_PATTERN.findall(source))
    target_counts = Counter(DEJAVU_TAG_PATTERN.findall(target))

    # Counter subtraction keeps only tags with a positive difference.
    missing = sorted(source_counts - target_counts)
    extra = sorted(target_counts - source_counts)

    issues = [f"Missing: {{{tag}}}" for tag in missing]
    issues.extend(f"Extra: {{{tag}}}" for tag in extra)

    return not issues, issues
761
+
762
+
763
# Manual smoke test: load the RTF given on the command line and list the
# first segments that were parsed from it.
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python dejavurtf_handler.py <path_to_rtf>")
        sys.exit(1)

    handler = DejaVuRTFHandler()
    if handler.load(sys.argv[1]):
        segments = handler.extract_source_segments()
        print(f"\nExtracted {len(segments)} segments:")

        # Show first 10
        for seg in segments[:10]:
            print(f" [{seg.segment_id}] {seg.source_text[:60]}...")

        remaining = len(segments) - 10
        if remaining > 0:
            print(f" ... and {remaining} more segments")