supervertaler 1.9.153__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic. Click here for more details.
- Supervertaler.py +47886 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1878 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +333 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1172 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.153.dist-info/METADATA +896 -0
- supervertaler-1.9.153.dist-info/RECORD +85 -0
- supervertaler-1.9.153.dist-info/WHEEL +5 -0
- supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.153.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,779 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Déjà Vu X3 Bilingual RTF Handler
|
|
3
|
+
|
|
4
|
+
This module handles the import and export of Déjà Vu X3 bilingual RTF files.
|
|
5
|
+
Déjà Vu exports bilingual tables in RTF format with a 4-column structure.
|
|
6
|
+
|
|
7
|
+
Format Structure:
|
|
8
|
+
- RTF file with embedded table
|
|
9
|
+
- 4 columns per row:
|
|
10
|
+
1. Segment ID (7-digit format like 0000049)
|
|
11
|
+
2. Source text with inline tags
|
|
12
|
+
3. Target text (empty on export, filled on re-import)
|
|
13
|
+
4. Comments (usually empty)
|
|
14
|
+
- Rows separated by \\row RTF control word
|
|
15
|
+
- Cells separated by \\cell RTF control word
|
|
16
|
+
|
|
17
|
+
Tag System:
|
|
18
|
+
- Inline tags: {NNNNN} format (e.g., {00108}, {00109})
|
|
19
|
+
- Tags appear in pairs (opening and closing)
|
|
20
|
+
- Tags wrap text: {00108}Vind jouw CS{00109}
|
|
21
|
+
- In RTF, tags are escaped: \\{00108\\}text\\{00109\\}
|
|
22
|
+
|
|
23
|
+
Critical for re-import:
|
|
24
|
+
- RTF structure must be preserved exactly
|
|
25
|
+
- Tags must be retained in translations
|
|
26
|
+
- Segment IDs must not be modified
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
import re
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import List, Dict, Tuple, Optional
|
|
32
|
+
from dataclasses import dataclass
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# RTF hex escapes (\'hh) mapped to the character they denote in the
# Windows-1252 code page. Punctuation and the non-breaking space are written
# as \u escapes so that they cannot be confused with (or silently normalised
# to) their ASCII look-alikes.
RTF_ESCAPE_MAP = {
    r"\'e9": "é",  # e-acute
    r"\'e8": "è",  # e-grave
    r"\'ea": "ê",  # e-circumflex
    r"\'eb": "ë",  # e-diaeresis
    r"\'e0": "à",  # a-grave
    r"\'e1": "á",  # a-acute
    r"\'e2": "â",  # a-circumflex
    r"\'e4": "ä",  # a-diaeresis
    r"\'e3": "ã",  # a-tilde
    r"\'f2": "ò",  # o-grave
    r"\'f3": "ó",  # o-acute
    r"\'f4": "ô",  # o-circumflex
    r"\'f6": "ö",  # o-diaeresis
    r"\'f5": "õ",  # o-tilde
    r"\'fa": "ú",  # u-acute
    r"\'f9": "ù",  # u-grave
    r"\'fb": "û",  # u-circumflex
    r"\'fc": "ü",  # u-diaeresis
    r"\'ec": "ì",  # i-grave
    r"\'ed": "í",  # i-acute
    r"\'ee": "î",  # i-circumflex
    r"\'ef": "ï",  # i-diaeresis
    r"\'f1": "ñ",  # n-tilde
    r"\'e7": "ç",  # c-cedilla
    r"\'df": "ß",  # German sharp s
    r"\'c9": "É",  # E-acute
    r"\'c8": "È",  # E-grave
    r"\'c0": "À",  # A-grave
    r"\'c1": "Á",  # A-acute
    r"\'d3": "Ó",  # O-acute
    r"\'da": "Ú",  # U-acute
    r"\'d1": "Ñ",  # N-tilde
    r"\'ab": "«",  # left guillemet
    r"\'bb": "»",  # right guillemet
    r"\'b0": "°",  # degree
    r"\'96": "\u2013",  # en-dash
    r"\'97": "\u2014",  # em-dash
    r"\'92": "\u2019",  # right single quote
    r"\'93": "\u201c",  # left double quote
    r"\'94": "\u201d",  # right double quote
    r"\'85": "\u2026",  # ellipsis
    r"\'a0": "\u00a0",  # non-breaking space
}
|
|
80
|
+
|
|
81
|
+
# Déjà Vu inline tag pattern: a literal "{", exactly five digits, a literal "}".
# Group 1 captures the five-digit tag number without the braces, so
# findall() returns the bare numbers (e.g. "00108").
DEJAVU_TAG_PATTERN = re.compile(r'\{(\d{5})\}')
|
|
83
|
+
|
|
84
|
+
# Language code mapping (RTF uses Windows LCID codes).
# Keys are the integers that follow \lang / \langnp in the RTF; values are the
# display names used by this module for source_lang / target_lang.
RTF_LANG_CODES = {
    # Western European
    1033: "English",
    2057: "English (UK)",
    3081: "English (AU)",
    4105: "English (CA)",
    1043: "Dutch",
    2067: "Dutch (BE)",
    1031: "German",
    2055: "German (CH)",
    3079: "German (AT)",
    1036: "French",
    2060: "French (BE)",
    3084: "French (CA)",
    4108: "French (CH)",
    3082: "Spanish",
    1034: "Spanish (Traditional)",
    2058: "Spanish (MX)",
    1040: "Italian",
    2064: "Italian (CH)",
    1046: "Portuguese (BR)",
    2070: "Portuguese (PT)",
    # Nordic
    1030: "Danish",
    1035: "Finnish",
    1044: "Norwegian",
    2068: "Norwegian (Nynorsk)",
    1053: "Swedish",
    1039: "Icelandic",
    # Eastern European
    1045: "Polish",
    1029: "Czech",
    1051: "Slovak",
    1038: "Hungarian",
    1048: "Romanian",
    1026: "Bulgarian",
    1050: "Croatian",
    2074: "Serbian (Latin)",
    3098: "Serbian (Cyrillic)",
    1060: "Slovenian",
    1058: "Ukrainian",
    1049: "Russian",
    1059: "Belarusian",
    1063: "Lithuanian",
    1062: "Latvian",
    1061: "Estonian",
    # Asian
    2052: "Chinese (Simplified)",
    1028: "Chinese (Traditional)",
    1041: "Japanese",
    1042: "Korean",
    1054: "Thai",
    1066: "Vietnamese",
    1057: "Indonesian",
    1086: "Malay",
    # Middle Eastern
    1037: "Hebrew",
    1025: "Arabic",
    2049: "Arabic (Iraq)",
    1065: "Persian",
    1055: "Turkish",
    1032: "Greek",
    # Other
    1027: "Catalan",
    1069: "Basque",
    1110: "Galician",
    1024: "Neutral",  # System default; ignored by language detection
}
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@dataclass
class DejaVuSegment:
    """
    One row of a Déjà Vu bilingual table: ID, source, target and comment.
    """
    segment_id: str        # 7-digit ID like "0000049"
    source_text: str       # Source text, Déjà Vu tags included
    target_text: str = ""  # Translation; empty until one is supplied
    comment: str = ""      # Content of the comment column
    row_index: int = 0     # Position of the segment in parse order

    @property
    def tags(self) -> List[str]:
        """Return the tag numbers found in the source text, in order."""
        return [hit.group(1) for hit in DEJAVU_TAG_PATTERN.finditer(self.source_text)]

    @property
    def plain_source(self) -> str:
        """Return the source text with every tag removed and the ends trimmed."""
        untagged = DEJAVU_TAG_PATTERN.sub('', self.source_text)
        return untagged.strip()

    def __repr__(self):
        # Show at most 50 characters of the source, marking truncation.
        text = self.source_text
        if len(text) > 50:
            preview = text[:50] + "..."
        else:
            preview = text
        return f"DejaVuSegment(id={self.segment_id}, source='{preview}')"
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class DejaVuRTFHandler:
|
|
183
|
+
"""
|
|
184
|
+
Handler for Déjà Vu X3 bilingual RTF files.
|
|
185
|
+
|
|
186
|
+
This class provides methods to:
|
|
187
|
+
- Load and parse Déjà Vu bilingual RTF files
|
|
188
|
+
- Extract source segments with tag markers
|
|
189
|
+
- Update target segments with translations
|
|
190
|
+
- Save modified files ready for re-import to Déjà Vu
|
|
191
|
+
"""
|
|
192
|
+
|
|
193
|
+
def __init__(self):
|
|
194
|
+
self.raw_rtf: str = "" # Original RTF content
|
|
195
|
+
self.segments: List[DejaVuSegment] = []
|
|
196
|
+
self.file_path: Optional[str] = None
|
|
197
|
+
self.source_lang: str = "Dutch"
|
|
198
|
+
self.target_lang: str = "Spanish"
|
|
199
|
+
self._cell_positions: List[Tuple[int, int, int, int]] = [] # (row_idx, seg_id_start, source_start, source_end, target_start, target_end)
|
|
200
|
+
|
|
201
|
+
def load(self, file_path: str) -> bool:
|
|
202
|
+
"""
|
|
203
|
+
Load a Déjà Vu bilingual RTF file.
|
|
204
|
+
|
|
205
|
+
Args:
|
|
206
|
+
file_path: Path to the Déjà Vu bilingual RTF file
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
bool: True if loaded successfully, False otherwise
|
|
210
|
+
"""
|
|
211
|
+
try:
|
|
212
|
+
self.file_path = file_path
|
|
213
|
+
|
|
214
|
+
# Read RTF content
|
|
215
|
+
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
|
|
216
|
+
self.raw_rtf = f.read()
|
|
217
|
+
|
|
218
|
+
# Detect languages from RTF
|
|
219
|
+
self._detect_languages()
|
|
220
|
+
|
|
221
|
+
# Parse segments
|
|
222
|
+
self._parse_segments()
|
|
223
|
+
|
|
224
|
+
print(f"Successfully loaded Deja Vu RTF: {file_path}")
|
|
225
|
+
print(f"Languages: {self.source_lang} -> {self.target_lang}")
|
|
226
|
+
print(f"Total segments: {len(self.segments)}")
|
|
227
|
+
|
|
228
|
+
return True
|
|
229
|
+
|
|
230
|
+
except Exception as e:
|
|
231
|
+
print(f"ERROR loading Deja Vu RTF: {e}")
|
|
232
|
+
import traceback
|
|
233
|
+
traceback.print_exc()
|
|
234
|
+
return False
|
|
235
|
+
|
|
236
|
+
def _detect_languages(self):
|
|
237
|
+
"""Detect source and target languages from RTF content."""
|
|
238
|
+
# Look for language codes in RTF header or content
|
|
239
|
+
# Pattern: \langNNNN or \langnpNNNN
|
|
240
|
+
lang_matches = re.findall(r'\\lang(?:np)?(\d+)', self.raw_rtf)
|
|
241
|
+
|
|
242
|
+
if lang_matches:
|
|
243
|
+
# Count occurrences to find the main languages
|
|
244
|
+
from collections import Counter
|
|
245
|
+
code_counts = Counter(int(m) for m in lang_matches)
|
|
246
|
+
|
|
247
|
+
# Get the two most common language codes (excluding 1024 which is neutral)
|
|
248
|
+
main_codes = [code for code, _ in code_counts.most_common() if code != 1024]
|
|
249
|
+
|
|
250
|
+
if len(main_codes) >= 2:
|
|
251
|
+
# In Déjà Vu bilingual, typically source appears less than target
|
|
252
|
+
# because target column may have formatting placeholders
|
|
253
|
+
code1, code2 = main_codes[0], main_codes[1]
|
|
254
|
+
|
|
255
|
+
# Map codes to languages
|
|
256
|
+
lang1 = RTF_LANG_CODES.get(code1, f"Unknown ({code1})")
|
|
257
|
+
lang2 = RTF_LANG_CODES.get(code2, f"Unknown ({code2})")
|
|
258
|
+
|
|
259
|
+
# Heuristic: the less frequent one is likely source (content)
|
|
260
|
+
# the more frequent one is target (includes empty cell formatting)
|
|
261
|
+
count1 = code_counts[code1]
|
|
262
|
+
count2 = code_counts[code2]
|
|
263
|
+
|
|
264
|
+
if count1 > count2:
|
|
265
|
+
# code1 is more frequent, likely target
|
|
266
|
+
self.source_lang = lang2
|
|
267
|
+
self.target_lang = lang1
|
|
268
|
+
else:
|
|
269
|
+
self.source_lang = lang1
|
|
270
|
+
self.target_lang = lang2
|
|
271
|
+
|
|
272
|
+
elif len(main_codes) == 1:
|
|
273
|
+
code = main_codes[0]
|
|
274
|
+
self.source_lang = RTF_LANG_CODES.get(code, f"Unknown ({code})")
|
|
275
|
+
|
|
276
|
+
def _decode_rtf_text(self, text: str) -> str:
|
|
277
|
+
"""Decode RTF escape sequences to plain text."""
|
|
278
|
+
result = text
|
|
279
|
+
|
|
280
|
+
# Replace RTF special character codes
|
|
281
|
+
for rtf_code, char in RTF_ESCAPE_MAP.items():
|
|
282
|
+
result = result.replace(rtf_code, char)
|
|
283
|
+
|
|
284
|
+
# Handle Unicode escapes (\uNNNNN?)
|
|
285
|
+
def replace_unicode(match):
|
|
286
|
+
code = int(match.group(1))
|
|
287
|
+
if code < 0:
|
|
288
|
+
code = 65536 + code # Handle negative values
|
|
289
|
+
return chr(code)
|
|
290
|
+
|
|
291
|
+
result = re.sub(r'\\u(-?\d+)\?', replace_unicode, result)
|
|
292
|
+
|
|
293
|
+
# Unescape RTF special characters
|
|
294
|
+
result = result.replace(r'\{', '{')
|
|
295
|
+
result = result.replace(r'\}', '}')
|
|
296
|
+
result = result.replace(r'\\', '\\')
|
|
297
|
+
|
|
298
|
+
# Remove RTF control words that might remain (but keep content)
|
|
299
|
+
# Be careful not to remove too much
|
|
300
|
+
result = re.sub(r'\\[a-z]+\d*\s?', '', result)
|
|
301
|
+
|
|
302
|
+
# Clean up multiple spaces
|
|
303
|
+
result = re.sub(r' +', ' ', result)
|
|
304
|
+
|
|
305
|
+
return result.strip()
|
|
306
|
+
|
|
307
|
+
def _parse_segments(self):
|
|
308
|
+
"""Parse RTF content to extract segments."""
|
|
309
|
+
self.segments = []
|
|
310
|
+
|
|
311
|
+
# RTF table structure uses \cell to separate cells and \row to end rows
|
|
312
|
+
# We need to find table rows containing segment data
|
|
313
|
+
|
|
314
|
+
# Split by \row to get table rows
|
|
315
|
+
# But \row appears with various suffixes, so be flexible
|
|
316
|
+
row_pattern = re.compile(r'\\row\b')
|
|
317
|
+
|
|
318
|
+
# Find all content between table cells
|
|
319
|
+
# Pattern: look for 7-digit segment ID followed by cell marker, then source, target, comment
|
|
320
|
+
|
|
321
|
+
# More robust approach: extract text between \cell markers
|
|
322
|
+
# Each row has: ID \cell Source \cell Target \cell Comment \cell
|
|
323
|
+
|
|
324
|
+
# Find the actual table content (between table start and end)
|
|
325
|
+
# The table rows follow the pattern with segment IDs like 0000049
|
|
326
|
+
|
|
327
|
+
segment_pattern = re.compile(
|
|
328
|
+
r'(\d{7})' # Segment ID (7 digits)
|
|
329
|
+
r'[^\\]*\\cell\s*\}' # After ID, find \cell
|
|
330
|
+
r'[^}]*\{[^}]*' # Skip formatting
|
|
331
|
+
r'([^\\]*(?:\\[^c][^\\]*)*)' # Source text (until next \cell)
|
|
332
|
+
r'\\cell\s*' # Cell separator
|
|
333
|
+
r'(.*?)' # Target text (empty or filled)
|
|
334
|
+
r'\\cell\s*' # Cell separator
|
|
335
|
+
r'(.*?)' # Comment
|
|
336
|
+
r'\\cell', # Final cell separator
|
|
337
|
+
re.DOTALL
|
|
338
|
+
)
|
|
339
|
+
|
|
340
|
+
# Simpler approach: find all occurrences of segment IDs followed by cell content
|
|
341
|
+
# Pattern: find 7-digit numbers that look like segment IDs
|
|
342
|
+
|
|
343
|
+
# Look for the actual segment pattern in RTF
|
|
344
|
+
# The content shows: 0000172}...source text...\cell \cell \cell
|
|
345
|
+
# This means: ID, source, empty target, empty comment
|
|
346
|
+
|
|
347
|
+
# Let's use a different approach - find segments by looking for the pattern
|
|
348
|
+
# of 7-digit ID followed by \cell, then text, then \cell \cell \cell
|
|
349
|
+
|
|
350
|
+
# Extract raw cell content using simpler pattern
|
|
351
|
+
# Split by \row to get rows first
|
|
352
|
+
|
|
353
|
+
# Find all text that appears between RTF formatting codes
|
|
354
|
+
# after a 7-digit segment ID
|
|
355
|
+
|
|
356
|
+
# Working pattern based on RTF structure observed:
|
|
357
|
+
# - ID appears as just digits after formatting codes
|
|
358
|
+
# - Then \cell (end of ID cell)
|
|
359
|
+
# - Then source content with embedded formatting and text
|
|
360
|
+
# - Then \cell (end of source cell)
|
|
361
|
+
# - Then \cell (end of empty target cell)
|
|
362
|
+
# - Then \cell (end of empty comment cell)
|
|
363
|
+
|
|
364
|
+
# Simplified extraction: find all 7-digit segment IDs and the text that follows
|
|
365
|
+
current_pos = 0
|
|
366
|
+
rtf = self.raw_rtf
|
|
367
|
+
|
|
368
|
+
# Find segment ID pattern in RTF context
|
|
369
|
+
# Looking for pattern like: ...insrsid9000367 0000172}... (ID followed by })
|
|
370
|
+
id_pattern = re.compile(r'(?:insrsid\d+\s+)(\d{7})\}')
|
|
371
|
+
|
|
372
|
+
for match in id_pattern.finditer(rtf):
|
|
373
|
+
segment_id = match.group(1)
|
|
374
|
+
start_pos = match.end()
|
|
375
|
+
|
|
376
|
+
# Find the source text - it's between the ID and the next \cell markers
|
|
377
|
+
# The pattern after ID is: {formatting}\cell }{formatting}source text}...
|
|
378
|
+
|
|
379
|
+
# Look for text content in the next cell (source cell)
|
|
380
|
+
# Skip to after first \cell (end of ID cell)
|
|
381
|
+
cell_pattern = re.compile(r'\\cell\s*')
|
|
382
|
+
cell_match = cell_pattern.search(rtf, start_pos)
|
|
383
|
+
|
|
384
|
+
if not cell_match:
|
|
385
|
+
continue
|
|
386
|
+
|
|
387
|
+
source_start = cell_match.end()
|
|
388
|
+
|
|
389
|
+
# Find the next \cell (end of source cell)
|
|
390
|
+
# But we need to extract the actual text content, not RTF codes
|
|
391
|
+
|
|
392
|
+
# Find the next 3 \cell markers (source, target, comment)
|
|
393
|
+
cells_remaining = 3
|
|
394
|
+
search_pos = source_start
|
|
395
|
+
cell_positions = []
|
|
396
|
+
|
|
397
|
+
for _ in range(cells_remaining):
|
|
398
|
+
cell_match = cell_pattern.search(rtf, search_pos)
|
|
399
|
+
if cell_match:
|
|
400
|
+
cell_positions.append((search_pos, cell_match.start()))
|
|
401
|
+
search_pos = cell_match.end()
|
|
402
|
+
|
|
403
|
+
if len(cell_positions) >= 3:
|
|
404
|
+
# Extract source text from first cell region
|
|
405
|
+
source_region = rtf[cell_positions[0][0]:cell_positions[0][1]]
|
|
406
|
+
source_text = self._extract_text_from_rtf_region(source_region)
|
|
407
|
+
|
|
408
|
+
# Extract target text from second cell region (usually empty)
|
|
409
|
+
target_region = rtf[cell_positions[1][0]:cell_positions[1][1]]
|
|
410
|
+
target_text = self._extract_text_from_rtf_region(target_region)
|
|
411
|
+
|
|
412
|
+
# Extract comment from third cell region
|
|
413
|
+
comment_region = rtf[cell_positions[2][0]:cell_positions[2][1]]
|
|
414
|
+
comment_text = self._extract_text_from_rtf_region(comment_region)
|
|
415
|
+
|
|
416
|
+
if source_text: # Only add if we have source text
|
|
417
|
+
segment = DejaVuSegment(
|
|
418
|
+
segment_id=segment_id,
|
|
419
|
+
source_text=source_text,
|
|
420
|
+
target_text=target_text,
|
|
421
|
+
comment=comment_text,
|
|
422
|
+
row_index=len(self.segments)
|
|
423
|
+
)
|
|
424
|
+
self.segments.append(segment)
|
|
425
|
+
|
|
426
|
+
def _extract_text_from_rtf_region(self, region: str) -> str:
|
|
427
|
+
"""Extract plain text from an RTF region."""
|
|
428
|
+
# Remove nested braces and their contents (formatting groups)
|
|
429
|
+
# but keep the actual text content
|
|
430
|
+
|
|
431
|
+
result = []
|
|
432
|
+
depth = 0
|
|
433
|
+
i = 0
|
|
434
|
+
text_buffer = []
|
|
435
|
+
|
|
436
|
+
while i < len(region):
|
|
437
|
+
char = region[i]
|
|
438
|
+
|
|
439
|
+
if char == '{':
|
|
440
|
+
depth += 1
|
|
441
|
+
elif char == '}':
|
|
442
|
+
depth -= 1
|
|
443
|
+
elif char == '\\':
|
|
444
|
+
# Handle escape sequences
|
|
445
|
+
if i + 1 < len(region):
|
|
446
|
+
next_char = region[i + 1]
|
|
447
|
+
if next_char == '{':
|
|
448
|
+
text_buffer.append('{')
|
|
449
|
+
i += 2
|
|
450
|
+
continue
|
|
451
|
+
elif next_char == '}':
|
|
452
|
+
text_buffer.append('}')
|
|
453
|
+
i += 2
|
|
454
|
+
continue
|
|
455
|
+
elif next_char == '\\':
|
|
456
|
+
text_buffer.append('\\')
|
|
457
|
+
i += 2
|
|
458
|
+
continue
|
|
459
|
+
elif next_char == "'":
|
|
460
|
+
# Hex character code
|
|
461
|
+
if i + 3 < len(region):
|
|
462
|
+
hex_code = region[i:i+4]
|
|
463
|
+
if hex_code in RTF_ESCAPE_MAP:
|
|
464
|
+
text_buffer.append(RTF_ESCAPE_MAP[hex_code])
|
|
465
|
+
i += 4
|
|
466
|
+
continue
|
|
467
|
+
# Skip control word
|
|
468
|
+
j = i + 1
|
|
469
|
+
while j < len(region) and (region[j].isalpha() or region[j].isdigit() or region[j] == '-'):
|
|
470
|
+
j += 1
|
|
471
|
+
if j < len(region) and region[j] == ' ':
|
|
472
|
+
j += 1 # Skip trailing space
|
|
473
|
+
i = j
|
|
474
|
+
continue
|
|
475
|
+
elif depth == 0 or True: # Collect text at any depth
|
|
476
|
+
# Only collect if not whitespace after control word
|
|
477
|
+
if char not in '{}\\\r\n':
|
|
478
|
+
text_buffer.append(char)
|
|
479
|
+
|
|
480
|
+
i += 1
|
|
481
|
+
|
|
482
|
+
text = ''.join(text_buffer)
|
|
483
|
+
|
|
484
|
+
# Clean up: remove excessive whitespace
|
|
485
|
+
text = re.sub(r'\s+', ' ', text)
|
|
486
|
+
text = text.strip()
|
|
487
|
+
|
|
488
|
+
return text
|
|
489
|
+
|
|
490
|
+
def extract_source_segments(self) -> List[DejaVuSegment]:
|
|
491
|
+
"""
|
|
492
|
+
Extract all source segments from the Déjà Vu bilingual RTF.
|
|
493
|
+
|
|
494
|
+
Returns:
|
|
495
|
+
list: List of DejaVuSegment objects
|
|
496
|
+
"""
|
|
497
|
+
return self.segments.copy()
|
|
498
|
+
|
|
499
|
+
def get_source_texts(self) -> List[str]:
|
|
500
|
+
"""Get list of source texts for translation."""
|
|
501
|
+
return [seg.source_text for seg in self.segments]
|
|
502
|
+
|
|
503
|
+
def get_target_texts(self) -> List[str]:
|
|
504
|
+
"""Get list of target texts (may be empty)."""
|
|
505
|
+
return [seg.target_text for seg in self.segments]
|
|
506
|
+
|
|
507
|
+
def update_translations(self, translations: Dict[str, str]) -> int:
|
|
508
|
+
"""
|
|
509
|
+
Update target segments with translations.
|
|
510
|
+
|
|
511
|
+
Args:
|
|
512
|
+
translations: Dict mapping segment_id to translated text
|
|
513
|
+
|
|
514
|
+
Returns:
|
|
515
|
+
int: Number of segments updated
|
|
516
|
+
"""
|
|
517
|
+
updated_count = 0
|
|
518
|
+
|
|
519
|
+
for segment in self.segments:
|
|
520
|
+
if segment.segment_id in translations:
|
|
521
|
+
segment.target_text = translations[segment.segment_id]
|
|
522
|
+
updated_count += 1
|
|
523
|
+
|
|
524
|
+
print(f"Updated {updated_count} target segments")
|
|
525
|
+
return updated_count
|
|
526
|
+
|
|
527
|
+
def update_translations_by_index(self, translations: Dict[int, str]) -> int:
|
|
528
|
+
"""
|
|
529
|
+
Update target segments with translations by row index.
|
|
530
|
+
|
|
531
|
+
Args:
|
|
532
|
+
translations: Dict mapping row_index to translated text
|
|
533
|
+
|
|
534
|
+
Returns:
|
|
535
|
+
int: Number of segments updated
|
|
536
|
+
"""
|
|
537
|
+
updated_count = 0
|
|
538
|
+
|
|
539
|
+
for segment in self.segments:
|
|
540
|
+
if segment.row_index in translations:
|
|
541
|
+
segment.target_text = translations[segment.row_index]
|
|
542
|
+
updated_count += 1
|
|
543
|
+
|
|
544
|
+
print(f"Updated {updated_count} target segments by index")
|
|
545
|
+
return updated_count
|
|
546
|
+
|
|
547
|
+
def save(self, output_path: str) -> bool:
|
|
548
|
+
"""
|
|
549
|
+
Save the RTF file with updated translations.
|
|
550
|
+
|
|
551
|
+
This method modifies the RTF by inserting translations into
|
|
552
|
+
the target column cells while preserving the RTF structure.
|
|
553
|
+
|
|
554
|
+
Args:
|
|
555
|
+
output_path: Path for the output RTF file
|
|
556
|
+
|
|
557
|
+
Returns:
|
|
558
|
+
bool: True if saved successfully, False otherwise
|
|
559
|
+
"""
|
|
560
|
+
try:
|
|
561
|
+
# Create translation map
|
|
562
|
+
translation_map = {seg.segment_id: seg.target_text for seg in self.segments if seg.target_text}
|
|
563
|
+
|
|
564
|
+
if not translation_map:
|
|
565
|
+
print("WARNING: No translations to save")
|
|
566
|
+
# Still save the file as-is
|
|
567
|
+
with open(output_path, 'w', encoding='utf-8') as f:
|
|
568
|
+
f.write(self.raw_rtf)
|
|
569
|
+
return True
|
|
570
|
+
|
|
571
|
+
# Modify RTF to insert translations
|
|
572
|
+
modified_rtf = self._insert_translations(translation_map)
|
|
573
|
+
|
|
574
|
+
# Save modified RTF
|
|
575
|
+
with open(output_path, 'w', encoding='utf-8') as f:
|
|
576
|
+
f.write(modified_rtf)
|
|
577
|
+
|
|
578
|
+
print(f"Saved Déjà Vu RTF to: {output_path}")
|
|
579
|
+
return True
|
|
580
|
+
|
|
581
|
+
except Exception as e:
|
|
582
|
+
print(f"ERROR saving Déjà Vu RTF: {e}")
|
|
583
|
+
import traceback
|
|
584
|
+
traceback.print_exc()
|
|
585
|
+
return False
|
|
586
|
+
|
|
587
|
+
def _encode_text_for_rtf(self, text: str) -> str:
|
|
588
|
+
"""Encode text for RTF format."""
|
|
589
|
+
result = []
|
|
590
|
+
|
|
591
|
+
for char in text:
|
|
592
|
+
code = ord(char)
|
|
593
|
+
if code > 127:
|
|
594
|
+
# Non-ASCII: use Unicode escape
|
|
595
|
+
result.append(f'\\u{code}?')
|
|
596
|
+
elif char == '{':
|
|
597
|
+
result.append('\\{')
|
|
598
|
+
elif char == '}':
|
|
599
|
+
result.append('\\}')
|
|
600
|
+
elif char == '\\':
|
|
601
|
+
result.append('\\\\')
|
|
602
|
+
elif char == '\n':
|
|
603
|
+
result.append('\\par ')
|
|
604
|
+
elif char == '\r':
|
|
605
|
+
pass # Skip carriage returns
|
|
606
|
+
else:
|
|
607
|
+
result.append(char)
|
|
608
|
+
|
|
609
|
+
return ''.join(result)
|
|
610
|
+
|
|
611
|
+
def _insert_translations(self, translations: Dict[str, str]) -> str:
|
|
612
|
+
"""
|
|
613
|
+
Insert translations into the RTF content.
|
|
614
|
+
|
|
615
|
+
This finds the target cell for each segment and inserts the translation.
|
|
616
|
+
The Déjà Vu RTF format has empty target cells that look like:
|
|
617
|
+
\\cell \\cell (two consecutive \\cell markers with nothing between)
|
|
618
|
+
|
|
619
|
+
We insert plain text just before the third \\cell marker.
|
|
620
|
+
"""
|
|
621
|
+
rtf = self.raw_rtf
|
|
622
|
+
|
|
623
|
+
# Pattern to find segment rows by their 7-digit ID
|
|
624
|
+
id_pattern = re.compile(r'(?:insrsid\d+\s+)(\d{7})\}')
|
|
625
|
+
cell_pattern = re.compile(r'\\cell\s*')
|
|
626
|
+
|
|
627
|
+
# Collect modifications (position, replacement_text)
|
|
628
|
+
modifications = []
|
|
629
|
+
|
|
630
|
+
for match in id_pattern.finditer(rtf):
|
|
631
|
+
segment_id = match.group(1)
|
|
632
|
+
|
|
633
|
+
if segment_id not in translations:
|
|
634
|
+
continue
|
|
635
|
+
|
|
636
|
+
translation = translations[segment_id]
|
|
637
|
+
if not translation:
|
|
638
|
+
continue
|
|
639
|
+
|
|
640
|
+
start_pos = match.end()
|
|
641
|
+
|
|
642
|
+
# Find cells after the ID:
|
|
643
|
+
# Cell 1: end of ID cell
|
|
644
|
+
# Cell 2: end of source cell
|
|
645
|
+
# Cell 3: end of target cell (we insert BEFORE this)
|
|
646
|
+
|
|
647
|
+
cell1 = cell_pattern.search(rtf, start_pos)
|
|
648
|
+
if not cell1:
|
|
649
|
+
continue
|
|
650
|
+
|
|
651
|
+
cell2 = cell_pattern.search(rtf, cell1.end())
|
|
652
|
+
if not cell2:
|
|
653
|
+
continue
|
|
654
|
+
|
|
655
|
+
cell3 = cell_pattern.search(rtf, cell2.end())
|
|
656
|
+
if not cell3:
|
|
657
|
+
continue
|
|
658
|
+
|
|
659
|
+
# Insert position is right after cell2 (before cell3)
|
|
660
|
+
insert_pos = cell2.end()
|
|
661
|
+
|
|
662
|
+
# Encode the translation for RTF
|
|
663
|
+
encoded_translation = self._encode_text_for_rtf(translation)
|
|
664
|
+
|
|
665
|
+
# Get target language code for RTF
|
|
666
|
+
target_lang_code = self._get_rtf_lang_code(self.target_lang) or 3082
|
|
667
|
+
|
|
668
|
+
# Build simple RTF-formatted text
|
|
669
|
+
# Format: {formatting}text{} - properly balanced braces
|
|
670
|
+
replacement = (
|
|
671
|
+
f'{{\\rtlch\\fcs1 \\af37 \\ltrch\\fcs0 '
|
|
672
|
+
f'\\f37\\lang{target_lang_code}\\langfe{target_lang_code}'
|
|
673
|
+
f'\\langnp{target_lang_code} {encoded_translation}}}'
|
|
674
|
+
)
|
|
675
|
+
|
|
676
|
+
modifications.append((insert_pos, replacement))
|
|
677
|
+
|
|
678
|
+
# Apply modifications from end to start to preserve positions
|
|
679
|
+
modifications.sort(key=lambda x: x[0], reverse=True)
|
|
680
|
+
|
|
681
|
+
for insert_pos, replacement in modifications:
|
|
682
|
+
rtf = rtf[:insert_pos] + replacement + rtf[insert_pos:]
|
|
683
|
+
|
|
684
|
+
return rtf
|
|
685
|
+
|
|
686
|
+
def _get_rtf_lang_code(self, lang_name: str) -> Optional[int]:
|
|
687
|
+
"""Get RTF language code from language name."""
|
|
688
|
+
for code, name in RTF_LANG_CODES.items():
|
|
689
|
+
if name.lower() == lang_name.lower():
|
|
690
|
+
return code
|
|
691
|
+
return None
|
|
692
|
+
|
|
693
|
+
def get_segment_by_id(self, segment_id: str) -> Optional[DejaVuSegment]:
|
|
694
|
+
"""Get a segment by its ID."""
|
|
695
|
+
for segment in self.segments:
|
|
696
|
+
if segment.segment_id == segment_id:
|
|
697
|
+
return segment
|
|
698
|
+
return None
|
|
699
|
+
|
|
700
|
+
def get_segment_count(self) -> int:
|
|
701
|
+
"""Get the number of segments."""
|
|
702
|
+
return len(self.segments)
|
|
703
|
+
|
|
704
|
+
def has_translations(self) -> bool:
|
|
705
|
+
"""Check if any segments have translations."""
|
|
706
|
+
return any(seg.target_text for seg in self.segments)
|
|
707
|
+
|
|
708
|
+
|
|
709
|
+
def extract_dejavu_tags(text: str) -> List[str]:
    """
    Collect the Déjà Vu tag numbers that occur in a text.

    Args:
        text: Text containing Déjà Vu tags

    Returns:
        List of tag numbers (5-digit strings), in order of appearance
    """
    return [hit.group(1) for hit in DEJAVU_TAG_PATTERN.finditer(text)]
|
|
720
|
+
|
|
721
|
+
|
|
722
|
+
def strip_dejavu_tags(text: str) -> str:
    """
    Delete every Déjà Vu tag from a text and trim the result.

    Args:
        text: Text containing Déjà Vu tags

    Returns:
        Text with tags removed and leading/trailing whitespace stripped
    """
    untagged = DEJAVU_TAG_PATTERN.sub('', text)
    return untagged.strip()
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
def validate_dejavu_tags(source: str, target: str) -> Tuple[bool, List[str]]:
    """
    Validate that target contains exactly the same tags as source.

    Tags are compared as sets of tag numbers, so repeated occurrences of the
    same tag are not counted.

    Args:
        source: Source text with tags
        target: Target text that should contain same tags

    Returns:
        Tuple of (is_valid, issues). ``issues`` lists "Missing: {NNNNN}"
        entries for tags absent from the target, followed by "Extra: {NNNNN}"
        entries for tags absent from the source, each group in ascending tag
        order. ``is_valid`` is True exactly when ``issues`` is empty.
    """
    source_tags = set(extract_dejavu_tags(source))
    target_tags = set(extract_dejavu_tags(target))

    # Sorted so the report is identical from run to run; plain set iteration
    # order for strings varies with hash randomisation.
    missing = sorted(source_tags - target_tags)
    extra = sorted(target_tags - source_tags)

    issues = [f"Missing: {{{t}}}" for t in missing]
    issues.extend(f"Extra: {{{t}}}" for t in extra)

    return not issues, issues
|
|
761
|
+
|
|
762
|
+
|
|
763
|
+
# Manual smoke test: load the RTF given on the command line and list the
# first segments. Exits with status 1 on bad usage or when loading fails.
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python dejavurtf_handler.py <path_to_rtf>")
        sys.exit(1)

    handler = DejaVuRTFHandler()
    if not handler.load(sys.argv[1]):
        # load() has already printed the error details.
        sys.exit(1)

    segments = handler.extract_source_segments()
    print(f"\nExtracted {len(segments)} segments:")
    for seg in segments[:10]:  # Show first 10
        print(f" [{seg.segment_id}] {seg.source_text[:60]}...")

    if len(segments) > 10:
        print(f" ... and {len(segments) - 10} more segments")
|