supervertaler 1.9.153__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic. Click here for more details.
- Supervertaler.py +47886 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1878 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +333 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1172 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.153.dist-info/METADATA +896 -0
- supervertaler-1.9.153.dist-info/RECORD +85 -0
- supervertaler-1.9.153.dist-info/WHEEL +5 -0
- supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.153.dist-info/top_level.txt +2 -0
modules/docx_handler.py
ADDED
|
@@ -0,0 +1,689 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DOCX Handler
|
|
3
|
+
Import and export DOCX files with formatting preservation
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
from docx import Document
|
|
12
|
+
from docx.shared import Pt, RGBColor, Inches
|
|
13
|
+
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
14
|
+
DOCX_AVAILABLE = True
|
|
15
|
+
except ImportError:
|
|
16
|
+
DOCX_AVAILABLE = False
|
|
17
|
+
print("ERROR: python-docx not installed. Run: pip install python-docx")
|
|
18
|
+
|
|
19
|
+
# Import tag manager for inline formatting
|
|
20
|
+
try:
|
|
21
|
+
from .tag_manager import TagManager
|
|
22
|
+
except ImportError:
|
|
23
|
+
try:
|
|
24
|
+
from tag_manager import TagManager
|
|
25
|
+
except ImportError:
|
|
26
|
+
print("WARNING: tag_manager not found. Inline formatting will not be preserved.")
|
|
27
|
+
TagManager = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class ParagraphInfo:
    """Information about a paragraph for reconstruction.

    Fields that default to None are annotated as Optional so the
    annotations agree with the defaults.
    """
    text: str  # Paragraph text
    style: Optional[str] = None  # Paragraph style name, when known
    alignment: Optional[str] = None  # String form of the alignment, when known
    paragraph_index: int = 0  # Index of this item among the extracted items
    document_position: int = 0  # Position in original document structure
    is_table_cell: bool = False  # True when the paragraph sits in a table cell
    table_index: Optional[int] = None  # Table / row / cell coordinates,
    row_index: Optional[int] = None  # meaningful only when is_table_cell
    cell_index: Optional[int] = None  # is True
    list_type: str = ""  # "bullet", "numbered", or ""
    list_number: Optional[int] = None  # For numbered lists
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class DOCXHandler:
|
|
47
|
+
"""Handle DOCX import and export operations"""
|
|
48
|
+
|
|
49
|
+
def __init__(self):
    """
    Create a handler with no document loaded.

    Raises:
        ImportError: if python-docx could not be imported
    """
    if not DOCX_AVAILABLE:
        raise ImportError("python-docx library is required. Install with: pip install python-docx")

    # Per-document state; starts out empty
    self.original_document, self.original_path = None, None
    self.paragraphs_info: List[ParagraphInfo] = []

    # TagManager is None when the tag_manager module could not be imported,
    # in which case inline formatting support is switched off
    if TagManager:
        self.tag_manager = TagManager()
    else:
        self.tag_manager = None

    self._list_type_cache = {}  # Cache for numId -> list_type mapping
|
|
58
|
+
|
|
59
|
+
def _get_list_type(self, para) -> tuple:
|
|
60
|
+
"""
|
|
61
|
+
Determine if a paragraph is a bullet or numbered list item.
|
|
62
|
+
Returns: (list_type, list_number) where list_type is "bullet", "numbered", or ""
|
|
63
|
+
"""
|
|
64
|
+
try:
|
|
65
|
+
# Check if paragraph has numbering
|
|
66
|
+
if not hasattr(para._element, 'pPr') or para._element.pPr is None:
|
|
67
|
+
return ("", None)
|
|
68
|
+
|
|
69
|
+
numPr = para._element.pPr.numPr
|
|
70
|
+
if numPr is None:
|
|
71
|
+
return ("", None)
|
|
72
|
+
|
|
73
|
+
# Get numId - the reference to the numbering definition
|
|
74
|
+
numId_elem = numPr.numId
|
|
75
|
+
if numId_elem is None:
|
|
76
|
+
return ("", None)
|
|
77
|
+
|
|
78
|
+
numId = numId_elem.val
|
|
79
|
+
|
|
80
|
+
# Check cache first
|
|
81
|
+
if numId in self._list_type_cache:
|
|
82
|
+
list_type = self._list_type_cache[numId]
|
|
83
|
+
else:
|
|
84
|
+
# Need to look up the numbering definition to determine type
|
|
85
|
+
# Access the numbering part of the document
|
|
86
|
+
list_type = "numbered" # Default assumption
|
|
87
|
+
|
|
88
|
+
try:
|
|
89
|
+
numbering_part = self.original_document.part.numbering_part
|
|
90
|
+
if numbering_part is not None:
|
|
91
|
+
# Get the numbering element
|
|
92
|
+
numbering_xml = numbering_part._element
|
|
93
|
+
|
|
94
|
+
# Find the num element with matching numId
|
|
95
|
+
for num in numbering_xml.findall('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}num'):
|
|
96
|
+
if num.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}numId') == str(numId):
|
|
97
|
+
# Get abstractNumId
|
|
98
|
+
abstractNumId_elem = num.find('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}abstractNumId')
|
|
99
|
+
if abstractNumId_elem is not None:
|
|
100
|
+
abstractNumId = abstractNumId_elem.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val')
|
|
101
|
+
|
|
102
|
+
# Find the abstractNum with this ID
|
|
103
|
+
for abstractNum in numbering_xml.findall('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}abstractNum'):
|
|
104
|
+
if abstractNum.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}abstractNumId') == abstractNumId:
|
|
105
|
+
# Check the first level (lvl) for numFmt
|
|
106
|
+
for lvl in abstractNum.findall('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}lvl'):
|
|
107
|
+
numFmt = lvl.find('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}numFmt')
|
|
108
|
+
if numFmt is not None:
|
|
109
|
+
fmt_val = numFmt.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val')
|
|
110
|
+
# bullet = bullet point, decimal/upperLetter/lowerLetter/upperRoman/lowerRoman = numbered
|
|
111
|
+
if fmt_val == 'bullet':
|
|
112
|
+
list_type = "bullet"
|
|
113
|
+
else:
|
|
114
|
+
list_type = "numbered"
|
|
115
|
+
break
|
|
116
|
+
break
|
|
117
|
+
break
|
|
118
|
+
except Exception as e:
|
|
119
|
+
# If we can't determine, check the text for bullet characters
|
|
120
|
+
text = para.text.strip() if para.text else ""
|
|
121
|
+
if text.startswith(('•', '·', '○', '■', '□', '►', '-', '*')):
|
|
122
|
+
list_type = "bullet"
|
|
123
|
+
else:
|
|
124
|
+
list_type = "numbered"
|
|
125
|
+
|
|
126
|
+
self._list_type_cache[numId] = list_type
|
|
127
|
+
|
|
128
|
+
# For numbered lists, try to get the actual number
|
|
129
|
+
list_number = None
|
|
130
|
+
if list_type == "numbered":
|
|
131
|
+
# We can't easily get the actual number from python-docx
|
|
132
|
+
# It will be calculated later based on position
|
|
133
|
+
pass
|
|
134
|
+
|
|
135
|
+
return (list_type, list_number)
|
|
136
|
+
|
|
137
|
+
except Exception as e:
|
|
138
|
+
# Fallback: check text for bullet characters
|
|
139
|
+
text = para.text.strip() if para.text else ""
|
|
140
|
+
if text.startswith(('•', '·', '○', '■', '□', '►', '-', '*')):
|
|
141
|
+
return ("bullet", None)
|
|
142
|
+
elif text and text[0].isdigit():
|
|
143
|
+
return ("numbered", None)
|
|
144
|
+
return ("", None)
|
|
145
|
+
|
|
146
|
+
def import_docx(self, file_path: str, extract_formatting: bool = True) -> List[str]:
    """
    Import DOCX file and extract paragraphs with formatting tags

    Body paragraphs and top-level tables are visited in document order.
    Every non-empty paragraph (body or table cell) yields one entry in the
    returned list and one ParagraphInfo in self.paragraphs_info, numbered
    consecutively through paragraph_index.

    Args:
        file_path: Path to DOCX file
        extract_formatting: If True, convert formatting to inline tags
                            (only has an effect when a tag manager is available)

    Returns: List of paragraph texts (with tags if extract_formatting=True)
             Includes both regular paragraphs AND table cells

    Raises:
        FileNotFoundError: if file_path does not exist
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    use_tags = bool(extract_formatting and self.tag_manager)

    print(f"[DOCX Handler] Importing: {file_path}")
    if use_tags:
        print("[DOCX Handler] Extracting inline formatting as tags")

    # Load document
    self.original_document = Document(file_path)
    self.original_path = file_path
    self.paragraphs_info = []
    # Numbering ids are only meaningful within one document, so results
    # cached for a previously imported file must not be reused
    self._list_type_cache = {}

    paragraphs = []
    bullet_prefixes = ('• ', '· ', '- ', '* ', '○ ', '■ ')

    def build_segment(para, text):
        """Return (segment_text, list_type, list_number) for one paragraph."""
        list_type, list_number = self._get_list_type(para)
        if not use_tags:
            return text, list_type, list_number

        runs = self.tag_manager.extract_runs(para)
        tagged = self.tag_manager.runs_to_tagged_text(runs)

        # Also detect from text if not detected from XML
        if not list_type:
            if tagged.lstrip().startswith(bullet_prefixes):
                list_type = "bullet"
            elif len(tagged) > 2 and tagged[0].isdigit() and tagged[1:3] in ('. ', ') '):
                list_type = "numbered"

        # Use <li-b> for bullets, <li-o> for numbered
        if list_type == "bullet":
            tagged = f"<li-b>{tagged}</li-b>"
        elif list_type == "numbered":
            tagged = f"<li-o>{tagged}</li-o>"
        return tagged, list_type, list_number

    def record(para, text, position, cell_ref=None):
        """Append one extracted item and its ParagraphInfo."""
        segment, list_type, list_number = build_segment(para, text)
        paragraphs.append(segment)
        table_idx, row_idx, cell_idx = cell_ref if cell_ref else (None, None, None)
        self.paragraphs_info.append(ParagraphInfo(
            text=text,
            style=para.style.name if para.style else None,
            # "is not None" rather than truthiness: LEFT alignment has the
            # enum value 0 and would otherwise be recorded as "no alignment"
            alignment=str(para.alignment) if para.alignment is not None else None,
            paragraph_index=len(self.paragraphs_info),
            document_position=position,
            is_table_cell=cell_ref is not None,
            table_index=table_idx,
            row_index=row_idx,
            cell_index=cell_idx,
            list_type=list_type,
            list_number=list_number,
        ))

    # The paragraphs/tables properties build fresh proxy objects on every
    # access, so take one snapshot and index it by XML element. The proxies
    # in these lists keep their elements alive, which keeps the id() keys
    # valid for the duration of the import.
    body_paragraphs = list(self.original_document.paragraphs)
    tables = list(self.original_document.tables)
    para_by_element = {id(p._element): p for p in body_paragraphs}
    table_by_element = {id(t._element): (idx, t) for idx, t in enumerate(tables)}

    # Process document elements in order
    doc_position = 0  # Track actual position in document for proper ordering
    for elem in self.original_document.element.body:
        para = para_by_element.get(id(elem))
        if para is not None:
            text = para.text.strip()
            if text:  # Only include non-empty paragraphs
                record(para, text, doc_position)
            doc_position += 1
            continue

        table_entry = table_by_element.get(id(elem))
        if table_entry is None:
            continue  # Neither a body paragraph nor a table (e.g. section properties)

        table_idx, table = table_entry
        for row_idx, row in enumerate(table.rows):
            for cell_idx, cell in enumerate(row.cells):
                # Each cell may contain multiple paragraphs
                for para in cell.paragraphs:
                    text = para.text.strip()
                    if text:  # Only include non-empty cells
                        record(para, text, doc_position, (table_idx, row_idx, cell_idx))
        doc_position += 1  # Table counts as one position

    table_cell_count = sum(1 for p in self.paragraphs_info if p.is_table_cell)
    print(f"[DOCX Handler] Extracted {len(paragraphs)} total items:")
    print(f" - Regular paragraphs: {len(paragraphs) - table_cell_count}")
    print(f" - Table cells: {table_cell_count} (from {len(tables)} tables)")
    return paragraphs
|
|
312
|
+
|
|
313
|
+
def export_docx(self, segments: List[Dict[str, Any]], output_path: str,
                preserve_formatting: bool = True):
    """
    Export translated segments back to DOCX

    Segments are grouped by 'paragraph_id', which is matched against the
    paragraph_index recorded at import time. Body paragraphs are paired with
    the recorded non-table entries in order, and the non-empty paragraphs of
    each table cell are paired with that cell's recorded entries in order.

    Args:
        segments: List of segment dictionaries with 'paragraph_id', 'source', 'target'
        output_path: Path to save the translated document
        preserve_formatting: Whether to preserve original formatting (default True)

    Raises:
        ValueError: if no document has been imported yet
    """
    print(f"[DOCX Handler] Exporting to: {output_path}")

    if not self.original_document:
        raise ValueError("No original document loaded. Import a DOCX first.")

    # Create a new document based on the original
    if preserve_formatting and self.original_path:
        # Copy the original document structure
        doc = Document(self.original_path)
    else:
        # Create new blank document
        # NOTE(review): a blank document has no paragraphs to replace, so
        # nothing is written in this mode - confirm intended behaviour.
        doc = Document()

    # Group segments by paragraph index
    para_segments = {}
    for seg in segments:
        para_segments.setdefault(seg.get('paragraph_id', 0), []).append(seg)

    def translations_for(para_id):
        """Non-blank targets of one paragraph; a missing/None target counts as blank."""
        targets = ((seg.get('target') or '') for seg in para_segments.get(para_id, ()))
        return [target for target in targets if target.strip()]

    # Track which paragraphs we've processed
    processed_paras = set()

    print(f"[DOCX Export] Starting export with {len(segments)} segments")
    print(f"[DOCX Export] Paragraph segments grouped into {len(para_segments)} paragraph indices")
    print(f"[DOCX Export] Document has {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables")

    table_para_count = sum(
        len(cell.paragraphs)
        for table in doc.tables
        for row in table.rows
        for cell in row.cells
    )
    print(f"[DOCX Export] Found {table_para_count} paragraphs inside tables")

    # First, process regular paragraphs. The import numbers body paragraphs
    # and table cells in one shared sequence, so a body paragraph's id is
    # taken from its recorded ParagraphInfo, not from a body-only counter.
    body_infos = [info for info in self.paragraphs_info if not info.is_table_cell]
    non_empty_seen = 0
    for para_idx, para in enumerate(doc.paragraphs):
        # Only process non-empty paragraphs (same logic as import)
        if not para.text.strip():
            print(f"[DOCX Export] Skipping doc.paragraphs[{para_idx}] - empty paragraph")
            continue

        para_info = body_infos[non_empty_seen] if non_empty_seen < len(body_infos) else None
        para_id = para_info.paragraph_index if para_info else non_empty_seen
        non_empty_seen += 1

        if para_id not in para_segments:
            print(f"[DOCX Export] Para {para_id}: No segments for this paragraph")
            continue

        translations = translations_for(para_id)
        if not translations:
            print(f"[DOCX Export] Para {para_id}: No translations found")
            continue

        # Join segments back into paragraph (single space, no extra newlines)
        new_text = ' '.join(translations)

        print(f"[DOCX Export] Para {para_id}: Replacing with {len(translations)} segment(s)")
        print(f"[DOCX Export] Original: {para.text[:50]}...")
        print(f"[DOCX Export] New: {new_text[:50]}...")

        # Replace text while preserving formatting AND style
        self._replace_paragraph_text(para, new_text, para_info.style if para_info else None)
        processed_paras.add(para_id)

    # Then, process table cells. Entries are grouped per cell so that the
    # n-th non-empty paragraph of a cell gets the n-th recorded entry,
    # instead of every paragraph receiving the first one.
    cell_infos = {}
    for info in self.paragraphs_info:
        if info.is_table_cell:
            key = (info.table_index, info.row_index, info.cell_index)
            cell_infos.setdefault(key, []).append(info)

    print(f"[DOCX Export] Processing {len(doc.tables)} tables...")
    for table_idx, table in enumerate(doc.tables):
        for row_idx, row in enumerate(table.rows):
            for cell_idx, cell in enumerate(row.cells):
                infos = cell_infos.get((table_idx, row_idx, cell_idx), [])
                # Each cell may contain multiple paragraphs
                non_empty = [p for p in cell.paragraphs if p.text.strip()]
                for position, para in enumerate(non_empty):
                    para_info = infos[position] if position < len(infos) else None

                    if para_info and para_info.paragraph_index in para_segments:
                        # Get translations for this cell
                        translations = translations_for(para_info.paragraph_index)

                        if translations:
                            new_text = ' '.join(translations)
                            print(f"[DOCX Export] Table[{table_idx}][{row_idx}][{cell_idx}] Para {para_info.paragraph_index}: Replacing")
                            print(f"[DOCX Export] Original: {para.text[:50]}...")
                            print(f"[DOCX Export] New: {new_text[:50]}...")
                            # Table cells can also have styles - preserve them
                            self._replace_paragraph_text(para, new_text, para_info.style)
                            processed_paras.add(para_info.paragraph_index)
                    elif para_info:
                        print(f"[DOCX Export] Table[{table_idx}][{row_idx}][{cell_idx}] Para {para_info.paragraph_index}: No translations")
                    else:
                        print(f"[DOCX Export] Table[{table_idx}][{row_idx}][{cell_idx}]: No para_info found")

    # Save the document
    doc.save(output_path)
    print(f"[DOCX Handler] Export complete: {output_path}")
    print(f"[DOCX Handler] Translated {len(processed_paras)} items (paragraphs + table cells)")
|
|
442
|
+
|
|
443
|
+
def _get_para_info(self, paragraph_index: int):
|
|
444
|
+
"""Get ParagraphInfo by paragraph index"""
|
|
445
|
+
for info in self.paragraphs_info:
|
|
446
|
+
if info.paragraph_index == paragraph_index:
|
|
447
|
+
return info
|
|
448
|
+
return None
|
|
449
|
+
|
|
450
|
+
def _find_table_cell_info(self, table_idx: int, row_idx: int, cell_idx: int):
|
|
451
|
+
"""Find ParagraphInfo for a specific table cell"""
|
|
452
|
+
for info in self.paragraphs_info:
|
|
453
|
+
if (info.is_table_cell and
|
|
454
|
+
info.table_index == table_idx and
|
|
455
|
+
info.row_index == row_idx and
|
|
456
|
+
info.cell_index == cell_idx):
|
|
457
|
+
return info
|
|
458
|
+
return None
|
|
459
|
+
|
|
460
|
+
def _replace_paragraph_text(self, paragraph, new_text: str, original_style: str = None):
|
|
461
|
+
"""
|
|
462
|
+
Replace paragraph text while preserving or applying formatting
|
|
463
|
+
|
|
464
|
+
If new_text contains inline tags (e.g., <b>text</b>), they will be
|
|
465
|
+
converted to proper formatting runs.
|
|
466
|
+
|
|
467
|
+
Args:
|
|
468
|
+
paragraph: The paragraph object to modify
|
|
469
|
+
new_text: The new text content
|
|
470
|
+
original_style: Optional original style name to preserve
|
|
471
|
+
"""
|
|
472
|
+
import re
|
|
473
|
+
|
|
474
|
+
# First, strip list item tags - these represent list structure (already preserved in paragraph style)
|
|
475
|
+
# and should NOT appear in the output text
|
|
476
|
+
new_text = re.sub(r'</?li-[ob]>', '', new_text)
|
|
477
|
+
|
|
478
|
+
# Check if text contains formatting tags
|
|
479
|
+
if self.tag_manager and ('<b>' in new_text or '<i>' in new_text or '<u>' in new_text or '<bi>' in new_text or '<sub>' in new_text or '<sup>' in new_text):
|
|
480
|
+
self._replace_paragraph_with_formatting(paragraph, new_text, original_style)
|
|
481
|
+
return
|
|
482
|
+
|
|
483
|
+
# Simple replacement (no tags) - preserve original formatting
|
|
484
|
+
# Store original formatting from first run (if any)
|
|
485
|
+
original_font_name = None
|
|
486
|
+
original_font_size = None
|
|
487
|
+
original_bold = False
|
|
488
|
+
original_italic = False
|
|
489
|
+
|
|
490
|
+
if paragraph.runs:
|
|
491
|
+
first_run = paragraph.runs[0]
|
|
492
|
+
if first_run.font:
|
|
493
|
+
original_font_name = first_run.font.name
|
|
494
|
+
original_font_size = first_run.font.size
|
|
495
|
+
original_bold = first_run.font.bold or False
|
|
496
|
+
original_italic = first_run.font.italic or False
|
|
497
|
+
|
|
498
|
+
# Clear paragraph - delete all runs except first
|
|
499
|
+
while len(paragraph.runs) > 1:
|
|
500
|
+
paragraph._element.remove(paragraph.runs[-1]._element)
|
|
501
|
+
|
|
502
|
+
# If no runs exist, create one
|
|
503
|
+
if not paragraph.runs:
|
|
504
|
+
run = paragraph.add_run()
|
|
505
|
+
else:
|
|
506
|
+
run = paragraph.runs[0]
|
|
507
|
+
|
|
508
|
+
# Set the new text (strip any trailing/leading whitespace to avoid extra newlines)
|
|
509
|
+
run.text = new_text.strip()
|
|
510
|
+
|
|
511
|
+
# Restore run-level formatting
|
|
512
|
+
if original_font_name:
|
|
513
|
+
run.font.name = original_font_name
|
|
514
|
+
if original_font_size:
|
|
515
|
+
run.font.size = original_font_size
|
|
516
|
+
if original_bold:
|
|
517
|
+
run.font.bold = True
|
|
518
|
+
if original_italic:
|
|
519
|
+
run.font.italic = True
|
|
520
|
+
|
|
521
|
+
# Preserve paragraph style if provided
|
|
522
|
+
if original_style:
|
|
523
|
+
try:
|
|
524
|
+
paragraph.style = original_style
|
|
525
|
+
except KeyError:
|
|
526
|
+
# Style doesn't exist in document - keep original
|
|
527
|
+
print(f"[DOCX Handler] Warning: Style '{original_style}' not found, keeping original style")
|
|
528
|
+
pass
|
|
529
|
+
|
|
530
|
+
def _replace_paragraph_with_formatting(self, paragraph, tagged_text: str, original_style: str = None):
|
|
531
|
+
"""
|
|
532
|
+
Replace paragraph text with formatted runs based on inline tags
|
|
533
|
+
|
|
534
|
+
Example: "Hello <b>world</b>!" creates runs with proper bold formatting
|
|
535
|
+
|
|
536
|
+
Args:
|
|
537
|
+
paragraph: The paragraph object to modify
|
|
538
|
+
tagged_text: Text with inline formatting tags
|
|
539
|
+
original_style: Optional original style name to preserve
|
|
540
|
+
"""
|
|
541
|
+
import re
|
|
542
|
+
|
|
543
|
+
# First, strip list item tags - these represent list structure (already preserved in paragraph style)
|
|
544
|
+
tagged_text = re.sub(r'</?li-[ob]>', '', tagged_text)
|
|
545
|
+
|
|
546
|
+
if not self.tag_manager:
|
|
547
|
+
# Fallback: strip tags and use simple replacement
|
|
548
|
+
clean_text = tagged_text.replace('<b>', '').replace('</b>', '')
|
|
549
|
+
clean_text = clean_text.replace('<i>', '').replace('</i>', '')
|
|
550
|
+
clean_text = clean_text.replace('<u>', '').replace('</u>', '')
|
|
551
|
+
clean_text = clean_text.replace('<bi>', '').replace('</bi>', '')
|
|
552
|
+
clean_text = clean_text.replace('<sub>', '').replace('</sub>', '')
|
|
553
|
+
clean_text = clean_text.replace('<sup>', '').replace('</sup>', '')
|
|
554
|
+
self._replace_paragraph_text(paragraph, clean_text, original_style)
|
|
555
|
+
return
|
|
556
|
+
|
|
557
|
+
# Store original font properties AND colors from all runs
|
|
558
|
+
original_font_name = None
|
|
559
|
+
original_font_size = None
|
|
560
|
+
original_run_colors = {} # Map text -> color for color preservation
|
|
561
|
+
|
|
562
|
+
if paragraph.runs:
|
|
563
|
+
first_run = paragraph.runs[0]
|
|
564
|
+
if first_run.font:
|
|
565
|
+
original_font_name = first_run.font.name
|
|
566
|
+
original_font_size = first_run.font.size
|
|
567
|
+
|
|
568
|
+
# Capture colors from all original runs (for text matching)
|
|
569
|
+
for run in paragraph.runs:
|
|
570
|
+
if run.text and run.font and run.font.color and run.font.color.rgb:
|
|
571
|
+
# Store the color for this text (stripped of whitespace for matching)
|
|
572
|
+
original_run_colors[run.text.strip()] = run.font.color.rgb
|
|
573
|
+
|
|
574
|
+
# Clear all runs
|
|
575
|
+
for run in paragraph.runs:
|
|
576
|
+
paragraph._element.remove(run._element)
|
|
577
|
+
|
|
578
|
+
# Convert tagged text to run specifications
|
|
579
|
+
run_specs = self.tag_manager.tagged_text_to_runs(tagged_text)
|
|
580
|
+
|
|
581
|
+
# Create runs with proper formatting
|
|
582
|
+
for spec in run_specs:
|
|
583
|
+
run = paragraph.add_run(spec['text'])
|
|
584
|
+
|
|
585
|
+
# Apply formatting
|
|
586
|
+
if spec.get('bold'):
|
|
587
|
+
run.font.bold = True
|
|
588
|
+
if spec.get('italic'):
|
|
589
|
+
run.font.italic = True
|
|
590
|
+
if spec.get('underline'):
|
|
591
|
+
run.font.underline = True
|
|
592
|
+
if spec.get('subscript'):
|
|
593
|
+
run.font.subscript = True
|
|
594
|
+
if spec.get('superscript'):
|
|
595
|
+
run.font.superscript = True
|
|
596
|
+
|
|
597
|
+
# Restore original font properties
|
|
598
|
+
if original_font_name:
|
|
599
|
+
run.font.name = original_font_name
|
|
600
|
+
if original_font_size:
|
|
601
|
+
run.font.size = original_font_size
|
|
602
|
+
|
|
603
|
+
# Try to restore original color if this text matches an original run
|
|
604
|
+
text_stripped = spec['text'].strip()
|
|
605
|
+
if text_stripped in original_run_colors:
|
|
606
|
+
run.font.color.rgb = original_run_colors[text_stripped]
|
|
607
|
+
|
|
608
|
+
# Preserve paragraph style if provided
|
|
609
|
+
if original_style:
|
|
610
|
+
try:
|
|
611
|
+
paragraph.style = original_style
|
|
612
|
+
except KeyError:
|
|
613
|
+
# Style doesn't exist in document - keep original
|
|
614
|
+
print(f"[DOCX Handler] Warning: Style '{original_style}' not found, keeping original style")
|
|
615
|
+
pass
|
|
616
|
+
|
|
617
|
+
def export_bilingual_docx(self, segments: List[Dict[str, Any]], output_path: str):
    """
    Export as bilingual document (source | target in table)
    Useful for review purposes

    A new document is created with a three-column table (#, Source, Target)
    holding one row per segment. Inline formatting tags and list item tags
    are removed from the source and target texts.

    Args:
        segments: List of segment dictionaries; 'id', 'source' and 'target'
                  are read, missing keys are shown as empty cells
        output_path: Path to save the bilingual document
    """
    import re

    # Every inline tag used in this module, plus the list item tags
    tag_pattern = re.compile(r'</?(?:bi|b|i|u|sub|sup|li-[ob])>')

    def strip_tags(text: str) -> str:
        """Remove formatting tags from text for clean display."""
        if not text:
            return ""
        return tag_pattern.sub('', text)

    print(f"[DOCX Handler] Exporting bilingual document: {output_path}")

    doc = Document()
    doc.add_heading('Bilingual Translation Document', 0)

    # Create table
    table = doc.add_table(rows=1, cols=3)
    table.style = 'Light Grid Accent 1'

    # Header row
    header_cells = table.rows[0].cells
    header_cells[0].text = '#'
    header_cells[1].text = 'Source'
    header_cells[2].text = 'Target'

    # Add segments - strip tags for clean display
    for seg in segments:
        row_cells = table.add_row().cells
        row_cells[0].text = str(seg.get('id', ''))
        row_cells[1].text = strip_tags(seg.get('source', ''))
        row_cells[2].text = strip_tags(seg.get('target', ''))

    doc.save(output_path)
    print("[DOCX Handler] Bilingual export complete")
|
|
659
|
+
|
|
660
|
+
def get_document_info(self) -> Dict[str, Any]:
    """Summarise the loaded document as a dict of counts plus its path ({} if none is loaded)"""
    document = self.original_document
    if not document:
        return {}

    # Split the recorded items into table cells and everything else
    recorded = self.paragraphs_info
    cell_count = len([info for info in recorded if info.is_table_cell])
    regular_count = len(recorded) - cell_count

    return dict(
        paragraphs=len(document.paragraphs),
        sections=len(document.sections),
        tables=len(document.tables),
        table_cells=cell_count,
        regular_paragraphs=regular_count,
        total_items=len(recorded),
        path=self.original_path,
    )
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
# Quick test
|
|
681
|
+
if __name__ == "__main__":
    # Smoke check only: reports whether python-docx could be imported
    report = ["DOCX Handler Test", "To test, you need a sample DOCX file."]

    if not DOCX_AVAILABLE:
        report.append("✗ python-docx is NOT installed")
        report.append(" Run: pip install python-docx")
    else:
        report.append("✓ python-docx is installed")

    for line in report:
        print(line)
|