supervertaler-1.9.153-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic.
- Supervertaler.py +47886 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1878 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +333 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1172 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.153.dist-info/METADATA +896 -0
- supervertaler-1.9.153.dist-info/RECORD +85 -0
- supervertaler-1.9.153.dist-info/WHEEL +5 -0
- supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.153.dist-info/top_level.txt +2 -0
@@ -0,0 +1,516 @@
+"""
+Phrase (Memsource) Bilingual DOCX Handler
+
+This module handles the import and export of Phrase (formerly Memsource) bilingual DOCX files.
+Phrase uses a multi-table format with numbered inline tags.
+
+Format Structure:
+- Multiple tables (typically 2 content tables + 3 metadata tables)
+- Content tables with 7 columns:
+  1. Segment ID (locked, gray D9D9D9)
+  2. Empty (locked, gray D9D9D9)
+  3. Segment number (locked, gray D9D9D9)
+  4. Source text with tags (locked, gray D9D9D9)
+  5. Target text with tags (EDITABLE, no shading)
+  6. Status code (locked, colored: 774306=99/confirmed, 5B37C3=MT, etc.)
+  7. Empty (no shading)
+
+Tag System:
+- Simple tags: {N} (e.g., {1}, {2})
+- Formatting tags: {N>text<N} (e.g., {1>CAUTION<1})
+- Empty formatting: {N><N}
+- Closing tag variant: <N}
+- Special content: {N> <N} (non-breaking space), {N>on page N<N} (cross-ref)
+
+Critical for re-import:
+- Only Column 5 (target text) should be edited
+- All other columns must remain unchanged
+- Tags must be preserved in the target
+- Cell shading/locking must be maintained
+"""
+
+import os
+import re
+import zipfile
+import xml.etree.ElementTree as ET
+from docx import Document
+from docx.shared import RGBColor, Pt
+from docx.oxml.ns import qn
+from lxml import etree
+from typing import List, Dict, Tuple, Optional
+from copy import deepcopy
+
+
+class PhraseSegment:
+    """
+    Represents a Phrase segment with tag information.
+    """
+    def __init__(self, segment_id: str, segment_num: str, source_text: str,
+                 target_text: str = "", status_code: str = "",
+                 row_index: int = 0, table_index: int = 0):
+        self.segment_id = segment_id
+        self.segment_num = segment_num
+        self.source_text = source_text  # Plain text with tags as text
+        self.target_text = target_text
+        self.status_code = status_code
+        self.row_index = row_index
+        self.table_index = table_index
+
+        # Extract tags from source for validation
+        self.source_tags = self._extract_tags(source_text)
+
+    def _extract_tags(self, text: str) -> List[str]:
+        """Extract all Phrase tag numbers from text."""
+        # Match {N}, {N>...<N}, <N}, {N><N}
+        pattern = r'\{(\d+)[>}]|<(\d+)\}'
+        matches = re.findall(pattern, text)
+        # Flatten tuples and remove empty strings
+        return [m for group in matches for m in group if m]
+
+    @property
+    def plain_source(self) -> str:
+        """Get source text without tags for translation."""
+        # Remove all Phrase tag patterns
+        text = re.sub(r'\{\d+\}', '', self.source_text)  # {N}
+        text = re.sub(r'\{\d+>.*?<\d+\}', '', text)  # {N>...<N}
+        text = re.sub(r'<\d+\}', '', text)  # <N}
+        text = re.sub(r'\{\d+><\d+\}', '', text)  # {N><N}
+        return text.strip()
+
+    def __repr__(self):
+        return f"PhraseSegment(id={self.segment_id[:20]}..., num={self.segment_num}, status={self.status_code})"
+
+
+class PhraseDOCXHandler:
+    """
+    Handler for Phrase (Memsource) bilingual DOCX files.
+
+    This class provides methods to:
+    - Load and parse Phrase bilingual DOCX files
+    - Extract source segments with tag markers
+    - Update target segments with translations (preserving exact structure)
+    - Save modified files ready for re-import to Phrase
+    """
+
+    # Phrase tag patterns
+    TAG_SIMPLE = re.compile(r'\{\d+\}')  # {1}
+    TAG_FORMATTED = re.compile(r'\{\d+>.*?<\d+\}')  # {1>text<1}
+    TAG_CLOSING = re.compile(r'<\d+\}')  # <1}
+    TAG_EMPTY = re.compile(r'\{\d+><\d+\}')  # {1><1}
+    TAG_ALL = re.compile(r'\{\d+(?:>.*?<\d+)?\}|<\d+\}')  # All patterns
+
+    def __init__(self):
+        self.doc = None
+        self.content_tables = []  # List of (table_obj, table_index) tuples
+        self.segments: List[PhraseSegment] = []
+        self.file_path = None
+
+    def load(self, file_path: str) -> bool:
+        """
+        Load a Phrase bilingual DOCX file.
+
+        Args:
+            file_path: Path to the Phrase bilingual DOCX file
+
+        Returns:
+            bool: True if loaded successfully, False otherwise
+        """
+        try:
+            self.file_path = file_path
+            self.doc = Document(file_path)
+
+            if len(self.doc.tables) == 0:
+                print(f"ERROR: No tables found in {file_path}")
+                return False
+
+            # Find content tables (tables with many rows and 7-8 columns)
+            self.content_tables = []
+            for idx, table in enumerate(self.doc.tables):
+                rows = table.rows
+                if len(rows) > 100 and len(rows[0].cells) >= 7:
+                    # Check if first cell looks like a Phrase segment ID
+                    first_cell = rows[0].cells[0].text.strip()
+                    if ':' in first_cell:  # Segment IDs have format "xxx:nnn"
+                        self.content_tables.append((table, idx))
+                        print(f"Found content table {idx} with {len(rows)} rows, {len(rows[0].cells)} columns")
+
+            if not self.content_tables:
+                print(f"ERROR: No Phrase content tables found")
+                return False
+
+            print(f"Successfully loaded Phrase bilingual DOCX: {file_path}")
+            print(f"Content tables: {len(self.content_tables)}")
+            print(f"Total segments: {sum(len(t[0].rows) for t in self.content_tables)}")
+
+            return True
+
+        except Exception as e:
+            print(f"ERROR loading Phrase DOCX: {e}")
+            import traceback
+            traceback.print_exc()
+            return False
+
+    def extract_source_segments(self) -> List[PhraseSegment]:
+        """
+        Extract all source segments from the Phrase bilingual DOCX.
+
+        Returns:
+            list: List of PhraseSegment objects
+        """
+        self.segments = []
+
+        if not self.content_tables:
+            print("ERROR: No content tables loaded")
+            return []
+
+        # Process each content table
+        for table_obj, table_idx in self.content_tables:
+            for row_idx, row in enumerate(table_obj.rows):
+                try:
+                    cells = row.cells
+
+                    # Extract data from columns
+                    segment_id = cells[0].text.strip()
+                    # Column 1 is empty
+                    segment_num = cells[2].text.strip()
+
+                    # Extract source and target with formatting as HTML tags
+                    source_cell = cells[3]
+                    target_cell = cells[4]
+                    source_text = self._cell_to_tagged_text(source_cell)
+                    target_text = self._cell_to_tagged_text(target_cell)
+
+                    status_code = cells[5].text.strip()
+                    # Column 6 is empty
+
+                    # Create PhraseSegment
+                    segment = PhraseSegment(
+                        segment_id=segment_id,
+                        segment_num=segment_num,
+                        source_text=source_text,
+                        target_text=target_text,
+                        status_code=status_code,
+                        row_index=row_idx,
+                        table_index=table_idx
+                    )
+
+                    self.segments.append(segment)
+
+                except Exception as e:
+                    print(f"WARNING: Error processing row {row_idx} in table {table_idx}: {e}")
+                    continue
+
+        print(f"Extracted {len(self.segments)} segments from Phrase DOCX")
+        return self.segments
+
+    def update_target_segments(self, translations: Dict[str, str]) -> int:
+        """
+        Update target segments with translations.
+
+        Args:
+            translations: Dict mapping segment_id to translated text (with Phrase tags)
+
+        Returns:
+            int: Number of segments updated
+        """
+        updated_count = 0
+
+        # Build a lookup map: segment_id -> (table_obj, row_idx)
+        segment_map = {}
+        for table_obj, table_idx in self.content_tables:
+            for row_idx, row in enumerate(table_obj.rows):
+                segment_id = row.cells[0].text.strip()
+                segment_map[segment_id] = (table_obj, row_idx)
+
+        # Update translations
+        for segment_id, translation in translations.items():
+            if segment_id in segment_map:
+                table_obj, row_idx = segment_map[segment_id]
+                row = table_obj.rows[row_idx]
+                source_cell = row.cells[3]  # Column 4 (source)
+                target_cell = row.cells[4]  # Column 5 (target)
+
+                # Clear existing target content
+                self._clear_cell(target_cell)
+
+                # Write new translation copying formatting from source
+                self._set_cell_text_with_source_formatting(target_cell, translation, source_cell)
+
+                updated_count += 1
+
+        print(f"Updated {updated_count} target segments")
+        return updated_count
+
+    def _clear_cell(self, cell):
+        """Clear all content from a cell."""
+        for para in cell.paragraphs:
+            for run in list(para.runs):
+                run._r.getparent().remove(run._r)
+
+    def _set_cell_text(self, cell, text: str):
+        """Set cell text, preserving whitespace."""
+        if not cell.paragraphs:
+            return
+
+        para = cell.paragraphs[0]
+
+        # Clear existing runs
+        for run in list(para.runs):
+            run._r.getparent().remove(run._r)
+
+        # Add new text with xml:space='preserve' for proper whitespace handling
+        if text:
+            run = para.add_run(text)
+            t_elem = run._r.find(qn('w:t'))
+            if t_elem is not None:
+                t_elem.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
+
+    def _cell_to_tagged_text(self, cell) -> str:
+        """
+        Convert cell with formatting to HTML-tagged text.
+        Uses the same format as memoQ handler: <b>, <i>, <u> tags.
+        """
+        result_parts = []
+
+        for paragraph in cell.paragraphs:
+            for run in paragraph.runs:
+                text = run.text
+                if not text:
+                    continue
+
+                # Determine which tags to apply
+                is_bold = run.bold == True
+                is_italic = run.italic == True
+                is_underline = run.underline == True
+
+                # Build tagged text
+                if is_bold or is_italic or is_underline:
+                    # Open tags (order: bold, italic, underline)
+                    if is_bold:
+                        text = f"<b>{text}"
+                    if is_italic:
+                        text = f"<i>{text}" if not is_bold else text.replace("<b>", "<b><i>", 1)
+                    if is_underline:
+                        if is_bold and is_italic:
+                            text = text.replace("<b><i>", "<b><i><u>", 1)
+                        elif is_bold:
+                            text = text.replace("<b>", "<b><u>", 1)
+                        elif is_italic:
+                            text = text.replace("<i>", "<i><u>", 1)
+                        else:
+                            text = f"<u>{text}"
+
+                    # Close tags (reverse order: underline, italic, bold)
+                    if is_underline:
+                        text = f"{text}</u>"
+                    if is_italic:
+                        text = f"{text}</i>"
+                    if is_bold:
+                        text = f"{text}</b>"
+
+                result_parts.append(text)
+
+        return ''.join(result_parts)
+
+    def _tagged_text_to_runs(self, text: str) -> list:
+        """
+        Parse text with HTML formatting tags and return a list of runs with formatting info.
+        Compatible with Supervertaler's memoQ format.
+        """
+        import re
+
+        runs = []
+
+        # Track current formatting state
+        is_bold = False
+        is_italic = False
+        is_underline = False
+
+        # Pattern to match opening/closing tags
+        tag_pattern = re.compile(r'(</?[biu]>)')
+
+        # Split text by tags, keeping the tags as delimiters
+        parts = tag_pattern.split(text)
+
+        current_text = ""
+
+        for part in parts:
+            if part == "<b>":
+                # Save current run if any
+                if current_text:
+                    runs.append({
+                        'text': current_text,
+                        'bold': is_bold,
+                        'italic': is_italic,
+                        'underline': is_underline
+                    })
+                    current_text = ""
+                is_bold = True
+            elif part == "</b>":
+                # Save current run if any
+                if current_text:
+                    runs.append({
+                        'text': current_text,
+                        'bold': is_bold,
+                        'italic': is_italic,
+                        'underline': is_underline
+                    })
+                    current_text = ""
+                is_bold = False
+            elif part == "<i>":
+                if current_text:
+                    runs.append({
+                        'text': current_text,
+                        'bold': is_bold,
+                        'italic': is_italic,
+                        'underline': is_underline
+                    })
+                    current_text = ""
+                is_italic = True
+            elif part == "</i>":
+                if current_text:
+                    runs.append({
+                        'text': current_text,
+                        'bold': is_bold,
+                        'italic': is_italic,
+                        'underline': is_underline
+                    })
+                    current_text = ""
+                is_italic = False
+            elif part == "<u>":
+                if current_text:
+                    runs.append({
+                        'text': current_text,
+                        'bold': is_bold,
+                        'italic': is_italic,
+                        'underline': is_underline
+                    })
+                    current_text = ""
+                is_underline = True
+            elif part == "</u>":
+                if current_text:
+                    runs.append({
+                        'text': current_text,
+                        'bold': is_bold,
+                        'italic': is_italic,
+                        'underline': is_underline
+                    })
+                    current_text = ""
+                is_underline = False
+            else:
+                # Regular text
+                current_text += part
+
+        # Don't forget the last run
+        if current_text:
+            runs.append({
+                'text': current_text,
+                'bold': is_bold,
+                'italic': is_italic,
+                'underline': is_underline
+            })
+
+        return runs
+
+    def _set_cell_text_with_source_formatting(self, target_cell, text: str, source_cell):
+        """
+        Set cell text parsing HTML formatting tags.
+        This preserves word-level bold, italic, and underline formatting.
+        """
+        if not target_cell.paragraphs:
+            return
+
+        para = target_cell.paragraphs[0]
+
+        # Clear existing runs
+        for run in list(para.runs):
+            run._r.getparent().remove(run._r)
+
+        # Parse HTML tags and create runs
+        runs = self._tagged_text_to_runs(text)
+
+        for run_info in runs:
+            run_text = run_info.get('text', '')
+            if not run_text:
+                continue
+
+            run = para.add_run(run_text)
+
+            # Set xml:space='preserve'
+            t_elem = run._r.find(qn('w:t'))
+            if t_elem is not None:
+                t_elem.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
+
+            # Apply formatting
+            if run_info.get('bold'):
+                run.bold = True
+            if run_info.get('italic'):
+                run.italic = True
+            if run_info.get('underline'):
+                run.underline = True
+
+    def save(self, output_path: str = None) -> bool:
+        """
+        Save the modified document.
+
+        Args:
+            output_path: Path to save to (defaults to original path)
+
+        Returns:
+            bool: True if saved successfully
+        """
+        try:
+            save_path = output_path or self.file_path
+            self.doc.save(save_path)
+            print(f"Saved Phrase bilingual DOCX: {save_path}")
+            return True
+        except Exception as e:
+            print(f"ERROR saving Phrase DOCX: {e}")
+            import traceback
+            traceback.print_exc()
+            return False
+
+    def get_segments_for_translation(self) -> List[Tuple[str, str, str]]:
+        """
+        Get segments that need translation.
+
+        Returns:
+            List of (segment_id, source_text, plain_source) tuples
+        """
+        result = []
+        for seg in self.segments:
+            # Include all segments (Phrase doesn't have a clear "Not Translated" status)
+            # Users can filter based on status_code if needed
+            if not seg.target_text or seg.status_code == "MT":
+                result.append((seg.segment_id, seg.source_text, seg.plain_source))
+        return result
+
+
+def detect_phrase_docx(file_path: str) -> bool:
+    """
+    Detect if a DOCX file is a Phrase bilingual file.
+
+    Returns:
+        bool: True if this appears to be a Phrase bilingual DOCX
+    """
+    try:
+        doc = Document(file_path)
+
+        if len(doc.tables) < 3:
+            return False
+
+        # Look for content tables with Phrase characteristics:
+        # - Many rows (>100)
+        # - 7 columns
+        # - First cell contains ':' (segment ID format)
+        for table in doc.tables:
+            if len(table.rows) > 100 and len(table.rows[0].cells) == 7:
+                first_cell = table.rows[0].cells[0].text.strip()
+                if ':' in first_cell:
+                    return True
+
+        return False
+
+    except Exception as e:
+        print(f"Error detecting Phrase DOCX: {e}")
+        return False