supervertaler 1.9.153__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic. Click here for more details.

Files changed (85):
  1. Supervertaler.py +47886 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1878 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +333 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1172 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.153.dist-info/METADATA +896 -0
  81. supervertaler-1.9.153.dist-info/RECORD +85 -0
  82. supervertaler-1.9.153.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.153.dist-info/top_level.txt +2 -0
@@ -0,0 +1,689 @@
1
+ """
2
+ DOCX Handler
3
+ Import and export DOCX files with formatting preservation
4
+ """
5
+
6
+ import os
7
+ from typing import List, Dict, Any
8
+ from dataclasses import dataclass
9
+
10
+ try:
11
+ from docx import Document
12
+ from docx.shared import Pt, RGBColor, Inches
13
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
14
+ DOCX_AVAILABLE = True
15
+ except ImportError:
16
+ DOCX_AVAILABLE = False
17
+ print("ERROR: python-docx not installed. Run: pip install python-docx")
18
+
19
+ # Import tag manager for inline formatting
20
+ try:
21
+ from .tag_manager import TagManager
22
+ except ImportError:
23
+ try:
24
+ from tag_manager import TagManager
25
+ except ImportError:
26
+ print("WARNING: tag_manager not found. Inline formatting will not be preserved.")
27
+ TagManager = None
28
+
29
+
30
@dataclass
class ParagraphInfo:
    """Record describing one imported paragraph so it can be rebuilt on export."""

    # Paragraph text as captured at import time
    text: str
    # Name of the paragraph style, when one was recorded
    style: str = None
    # String form of the paragraph alignment, when one was recorded
    alignment: str = None
    # Running index of this paragraph among the imported items
    paragraph_index: int = 0
    # Position in original document structure
    document_position: int = 0
    # Table coordinates; only meaningful when is_table_cell is True
    is_table_cell: bool = False
    table_index: int = None
    row_index: int = None
    cell_index: int = None
    # "bullet", "numbered", or "" when the paragraph is not a list item
    list_type: str = ""
    # For numbered lists
    list_number: int = None
44
+
45
+
46
class DOCXHandler:
    """Handle DOCX import and export operations.

    ``import_docx`` flattens a document into a list of translatable items
    (non-empty body paragraphs and non-empty table-cell paragraphs, in
    document order) and records a ``ParagraphInfo`` for each one.
    ``export_docx`` walks a fresh copy of the same document in exactly the
    same order, so the N-th item found on export is the N-th item returned
    by import, and writes the translated text back into it.
    """

    # WordprocessingML namespace in Clark notation, used for numbering lookups
    _W_NS = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    # Leading characters that mark a bullet item when numbering XML is unusable
    _BULLET_CHARS = ('•', '·', '○', '■', '□', '►', '-', '*')
    # Leading "bullet + space" prefixes used for text-based list detection
    _BULLET_PREFIXES = ('• ', '· ', '- ', '* ', '○ ', '■ ')
    # Opening inline tags that trigger run-by-run reconstruction on export
    _INLINE_OPEN_TAGS = ('<b>', '<i>', '<u>', '<bi>', '<sub>', '<sup>')
    # Regexes for the structural list tags and the inline formatting tags
    _LIST_TAG_PATTERN = r'</?li-[ob]>'
    _INLINE_TAG_PATTERN = r'</?(?:bi|b|i|u|sub|sup)>'

    def __init__(self):
        """Create an empty handler.

        Raises:
            ImportError: if python-docx could not be imported.
        """
        if not DOCX_AVAILABLE:
            raise ImportError("python-docx library is required. Install with: pip install python-docx")

        self.original_document = None
        self.original_path = None
        self.paragraphs_info: List[ParagraphInfo] = []
        self.tag_manager = TagManager() if TagManager else None
        self._list_type_cache = {}  # (numId, ilvl) -> list_type

    # ------------------------------------------------------------------
    # List detection
    # ------------------------------------------------------------------
    def _get_list_type(self, para) -> tuple:
        """Determine if a paragraph is a bullet or numbered list item.

        Returns:
            ``(list_type, list_number)`` where ``list_type`` is "bullet",
            "numbered", or "". ``list_number`` is always ``None``: the
            rendered number is not available from python-docx.
        """
        try:
            pPr = getattr(para._element, 'pPr', None)
            if pPr is None:
                return ("", None)

            numPr = pPr.numPr
            if numPr is None:
                return ("", None)

            numId_elem = numPr.numId
            if numId_elem is None:
                return ("", None)

            num_id = numId_elem.val
            # In WordprocessingML a numId of 0 removes numbering from the
            # paragraph, so it must not be reported as a list item.
            if num_id == 0:
                return ("", None)

            # The list level decides the format: a numbered list can carry
            # bullet sub-levels (and vice versa).
            ilvl_elem = getattr(numPr, 'ilvl', None)
            level = ilvl_elem.val if ilvl_elem is not None else 0

            cache_key = (num_id, level)
            if cache_key in self._list_type_cache:
                return (self._list_type_cache[cache_key], None)

            try:
                list_type = self._lookup_list_format(num_id, level)
            except Exception:
                # Best effort: the numbering part may be missing or
                # unreadable. Guess from the text, but do not cache the
                # guess -- it describes this paragraph only, not the list.
                text = para.text.strip() if para.text else ""
                if text.startswith(self._BULLET_CHARS):
                    return ("bullet", None)
                return ("numbered", None)

            self._list_type_cache[cache_key] = list_type
            return (list_type, None)

        except Exception:
            # Fallback: check text for bullet characters / a leading digit
            text = para.text.strip() if para.text else ""
            if text.startswith(self._BULLET_CHARS):
                return ("bullet", None)
            if text and text[0].isdigit():
                return ("numbered", None)
            return ("", None)

    def _lookup_list_format(self, num_id, level) -> str:
        """Resolve numId/ilvl through the numbering part to "bullet" or "numbered".

        Uses the ``w:lvl`` whose ``w:ilvl`` matches ``level``; when there is
        no such level, falls back to the first level that declares a
        ``w:numFmt``. Anything that cannot be resolved is "numbered".
        """
        w = self._W_NS
        numbering_part = self.original_document.part.numbering_part
        if numbering_part is None:
            return "numbered"
        root = numbering_part._element

        # numId -> abstractNumId
        abstract_id = None
        for num in root.findall(f'.//{w}num'):
            if num.get(f'{w}numId') == str(num_id):
                ref = num.find(f'.//{w}abstractNumId')
                if ref is not None:
                    abstract_id = ref.get(f'{w}val')
                break
        if abstract_id is None:
            return "numbered"

        # abstractNumId -> numFmt of the requested level
        for abstract_num in root.findall(f'.//{w}abstractNum'):
            if abstract_num.get(f'{w}abstractNumId') != abstract_id:
                continue
            first_fmt = None
            for lvl in abstract_num.findall(f'.//{w}lvl'):
                num_fmt = lvl.find(f'.//{w}numFmt')
                if num_fmt is None:
                    continue
                fmt_val = num_fmt.get(f'{w}val')
                if lvl.get(f'{w}ilvl') == str(level):
                    return "bullet" if fmt_val == 'bullet' else "numbered"
                if first_fmt is None:
                    first_fmt = fmt_val
            return "bullet" if first_fmt == 'bullet' else "numbered"

        return "numbered"

    def _guess_list_type_from_text(self, text: str) -> str:
        """Guess "bullet"/"numbered"/"" from a literal list marker in the text."""
        import re

        if text.lstrip().startswith(self._BULLET_PREFIXES):
            return "bullet"
        # One or more digits followed by ". " or ") " (e.g. "3. " or "12) ")
        if re.match(r'\d+[.)] ', text):
            return "numbered"
        return ""

    # ------------------------------------------------------------------
    # Document traversal shared by import and export
    # ------------------------------------------------------------------
    def _iter_translatable_paragraphs(self, document):
        """Yield every non-empty paragraph of ``document`` in document order.

        Yields:
            ``(paragraph, document_position, table_pos)`` where
            ``document_position`` counts body-level paragraphs and tables,
            and ``table_pos`` is ``None`` for body paragraphs or
            ``(table_index, row_index, cell_index)`` for table paragraphs.

        Import and export both enumerate this generator, which is what keeps
        ``ParagraphInfo.paragraph_index`` and the export index in step.
        """
        # Map body-level XML elements to their python-docx wrappers once.
        # The wrappers are held by the dict values, which keeps the element
        # objects alive and therefore keeps their id() stable.
        paras_by_elem = {id(p._element): p for p in document.paragraphs}
        tables_by_elem = {id(t._element): (idx, t)
                          for idx, t in enumerate(document.tables)}

        doc_position = 0
        for elem in document.element.body:
            tag = elem.tag if isinstance(elem.tag, str) else ''

            if tag.endswith('}p'):
                para = paras_by_elem.get(id(elem))
                if para is None:
                    continue
                if para.text.strip():
                    yield para, doc_position, None
                doc_position += 1

            elif tag.endswith('}tbl'):
                entry = tables_by_elem.get(id(elem))
                if entry is None:
                    continue
                table_idx, table = entry
                for row_idx, row in enumerate(table.rows):
                    for cell_idx, cell in enumerate(row.cells):
                        # Each cell may contain multiple paragraphs
                        for para in cell.paragraphs:
                            if para.text.strip():
                                yield para, doc_position, (table_idx, row_idx, cell_idx)
                doc_position += 1  # Table counts as one position

    # ------------------------------------------------------------------
    # Import
    # ------------------------------------------------------------------
    def import_docx(self, file_path: str, extract_formatting: bool = True) -> List[str]:
        """Import DOCX file and extract paragraphs with formatting tags.

        Args:
            file_path: Path to DOCX file
            extract_formatting: If True (and a tag manager is available),
                convert inline formatting to tags and wrap list items in
                ``<li-b>`` / ``<li-o>``.

        Returns:
            List of paragraph texts, covering both regular paragraphs and
            table-cell paragraphs, in document order. Empty paragraphs are
            skipped.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        use_tags = bool(extract_formatting and self.tag_manager)
        print(f"[DOCX Handler] Importing: {file_path}")
        if use_tags:
            print("[DOCX Handler] Extracting inline formatting as tags")

        self.original_document = Document(file_path)
        self.original_path = file_path
        self.paragraphs_info = []
        # Numbering ids are only meaningful within one document
        self._list_type_cache = {}

        paragraphs = []
        walker = self._iter_translatable_paragraphs(self.original_document)
        for para_index, (para, doc_position, table_pos) in enumerate(walker):
            text = para.text.strip()
            list_type, list_number = self._get_list_type(para)

            if use_tags:
                runs = self.tag_manager.extract_runs(para)
                segment_text = self.tag_manager.runs_to_tagged_text(runs)

                # Also detect from text if not detected from XML
                if not list_type:
                    list_type = self._guess_list_type_from_text(segment_text)

                # <li-b> for bullets, <li-o> for numbered
                if list_type == "bullet":
                    segment_text = f"<li-b>{segment_text}</li-b>"
                elif list_type:
                    segment_text = f"<li-o>{segment_text}</li-o>"
            else:
                segment_text = text

            paragraphs.append(segment_text)

            table_idx, row_idx, cell_idx = table_pos or (None, None, None)
            self.paragraphs_info.append(ParagraphInfo(
                text=text,
                style=para.style.name if para.style else None,
                # Compare with None: the LEFT alignment enum has value 0
                alignment=str(para.alignment) if para.alignment is not None else None,
                paragraph_index=para_index,
                document_position=doc_position,
                is_table_cell=table_pos is not None,
                table_index=table_idx,
                row_index=row_idx,
                cell_index=cell_idx,
                list_type=list_type,
                list_number=list_number,
            ))

        table_cell_count = sum(1 for p in self.paragraphs_info if p.is_table_cell)
        print(f"[DOCX Handler] Extracted {len(paragraphs)} total items:")
        print(f" - Regular paragraphs: {len(paragraphs) - table_cell_count}")
        print(f" - Table cells: {table_cell_count} (from {len(self.original_document.tables)} tables)")
        return paragraphs

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------
    @staticmethod
    def _join_translations(segs) -> str:
        """Join the non-blank 'target' texts of ``segs`` with single spaces."""
        targets = ((seg.get('target') or '') for seg in segs)
        return ' '.join(t for t in targets if t.strip())

    def export_docx(self, segments: List[Dict[str, Any]], output_path: str,
                    preserve_formatting: bool = True):
        """Export translated segments back to DOCX.

        Args:
            segments: List of segment dictionaries with 'paragraph_id',
                'source', 'target'. 'paragraph_id' is the index of the item
                in the list returned by ``import_docx``.
            output_path: Path to save the translated document
            preserve_formatting: If True, the original file is re-opened and
                its paragraphs are overwritten in place. If False, a blank
                document is created and one paragraph is appended per
                translated paragraph id.

        Raises:
            ValueError: if no document has been imported yet.
        """
        print(f"[DOCX Handler] Exporting to: {output_path}")

        if self.original_document is None:
            raise ValueError("No original document loaded. Import a DOCX first.")

        in_place = bool(preserve_formatting and self.original_path)
        doc = Document(self.original_path) if in_place else Document()

        # Group segments by paragraph index
        para_segments = {}
        for seg in segments:
            para_segments.setdefault(seg.get('paragraph_id', 0), []).append(seg)

        processed_paras = set()

        print(f"[DOCX Export] Starting export with {len(segments)} segments")
        print(f"[DOCX Export] Paragraph segments grouped into {len(para_segments)} paragraph indices")
        print(f"[DOCX Export] Document has {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables")

        if in_place:
            # Same traversal as import_docx, so para_index here equals the
            # paragraph_index recorded at import (body paragraphs and table
            # cells share one counter).
            walker = self._iter_translatable_paragraphs(doc)
            for para_index, (para, _position, table_pos) in enumerate(walker):
                if table_pos is None:
                    label = f"Para {para_index}"
                else:
                    label = (f"Table[{table_pos[0]}][{table_pos[1]}][{table_pos[2]}] "
                             f"Para {para_index}")

                segs = para_segments.get(para_index)
                if not segs:
                    print(f"[DOCX Export] {label}: No segments for this paragraph")
                    continue

                new_text = self._join_translations(segs)
                if not new_text:
                    print(f"[DOCX Export] {label}: No translations found")
                    continue

                print(f"[DOCX Export] {label}: Replacing with {len(segs)} segment(s)")
                print(f"[DOCX Export] Original: {para.text[:50]}...")
                print(f"[DOCX Export] New: {new_text[:50]}...")

                para_info = self._get_para_info(para_index)
                # Replace text while preserving formatting AND style
                self._replace_paragraph_text(para, new_text,
                                             para_info.style if para_info else None)
                processed_paras.add(para_index)
        else:
            # A blank document has nothing to overwrite: append the
            # translated paragraphs in paragraph-id order instead.
            try:
                ordered_ids = sorted(para_segments)
            except TypeError:
                ordered_ids = list(para_segments)
            for para_id in ordered_ids:
                new_text = self._join_translations(para_segments[para_id])
                if not new_text:
                    continue
                para_info = self._get_para_info(para_id)
                self._replace_paragraph_text(doc.add_paragraph(), new_text,
                                             para_info.style if para_info else None)
                processed_paras.add(para_id)

        doc.save(output_path)
        print(f"[DOCX Handler] Export complete: {output_path}")
        print(f"[DOCX Handler] Translated {len(processed_paras)} items (paragraphs + table cells)")

    def _get_para_info(self, paragraph_index: int):
        """Get ParagraphInfo by paragraph index, or None when there is none."""
        return next((info for info in self.paragraphs_info
                     if info.paragraph_index == paragraph_index), None)

    def _find_table_cell_info(self, table_idx: int, row_idx: int, cell_idx: int):
        """Find the first ParagraphInfo recorded for a specific table cell."""
        wanted = (table_idx, row_idx, cell_idx)
        for info in self.paragraphs_info:
            if info.is_table_cell and (info.table_index, info.row_index, info.cell_index) == wanted:
                return info
        return None

    def _apply_paragraph_style(self, paragraph, original_style):
        """Re-apply ``original_style`` to ``paragraph``; warn if it is unknown."""
        if not original_style:
            return
        try:
            paragraph.style = original_style
        except KeyError:
            # Style doesn't exist in document - keep original
            print(f"[DOCX Handler] Warning: Style '{original_style}' not found, keeping original style")

    def _replace_paragraph_text(self, paragraph, new_text: str, original_style: str = None):
        """Replace paragraph text while preserving or applying formatting.

        If ``new_text`` contains inline tags (e.g. ``<b>text</b>``) and a tag
        manager is available, they are converted to formatted runs. Without
        a tag manager the tags are removed so they never reach the output as
        literal text.

        Args:
            paragraph: The paragraph object to modify
            new_text: The new text content
            original_style: Optional original style name to preserve
        """
        import re

        # List item tags represent list structure (already carried by the
        # paragraph itself) and must not appear in the output text.
        new_text = re.sub(self._LIST_TAG_PATTERN, '', new_text)

        if self.tag_manager:
            if any(tag in new_text for tag in self._INLINE_OPEN_TAGS):
                self._replace_paragraph_with_formatting(paragraph, new_text, original_style)
                return
        else:
            new_text = re.sub(self._INLINE_TAG_PATTERN, '', new_text)

        # Simple replacement: keep the first run (and with it its run
        # properties), drop the others, and overwrite the text.
        runs = list(paragraph.runs)

        font_name = None
        font_size = None
        bold = False
        italic = False
        if runs and runs[0].font:
            first_font = runs[0].font
            font_name = first_font.name
            font_size = first_font.size
            bold = first_font.bold or False
            italic = first_font.italic or False

        for extra in runs[1:]:
            paragraph._element.remove(extra._element)

        run = runs[0] if runs else paragraph.add_run()

        # Strip leading/trailing whitespace to avoid extra newlines
        run.text = new_text.strip()

        # Restore run-level formatting
        if font_name:
            run.font.name = font_name
        if font_size:
            run.font.size = font_size
        if bold:
            run.font.bold = True
        if italic:
            run.font.italic = True

        self._apply_paragraph_style(paragraph, original_style)

    def _replace_paragraph_with_formatting(self, paragraph, tagged_text: str, original_style: str = None):
        """Replace paragraph text with formatted runs based on inline tags.

        Example: "Hello <b>world</b>!" creates runs with proper bold formatting

        Args:
            paragraph: The paragraph object to modify
            tagged_text: Text with inline formatting tags
            original_style: Optional original style name to preserve
        """
        import re

        # List item tags are structural only
        tagged_text = re.sub(self._LIST_TAG_PATTERN, '', tagged_text)

        if not self.tag_manager:
            # No way to interpret the tags: the plain-text path removes them
            self._replace_paragraph_text(paragraph, tagged_text, original_style)
            return

        # Remember the first run's font and every coloured run's colour so
        # they can be re-applied to the rebuilt runs.
        original_runs = list(paragraph.runs)
        font_name = None
        font_size = None
        colors_by_text = {}  # stripped run text -> RGB colour

        if original_runs:
            first_run = original_runs[0]
            if first_run.font:
                font_name = first_run.font.name
                font_size = first_run.font.size

            for run in original_runs:
                if run.text and run.font and run.font.color and run.font.color.rgb:
                    colors_by_text[run.text.strip()] = run.font.color.rgb

        # Clear all runs
        for run in original_runs:
            paragraph._element.remove(run._element)

        # Rebuild one run per formatting span
        for spec in self.tag_manager.tagged_text_to_runs(tagged_text):
            run = paragraph.add_run(spec['text'])

            if spec.get('bold'):
                run.font.bold = True
            if spec.get('italic'):
                run.font.italic = True
            if spec.get('underline'):
                run.font.underline = True
            if spec.get('subscript'):
                run.font.subscript = True
            if spec.get('superscript'):
                run.font.superscript = True

            if font_name:
                run.font.name = font_name
            if font_size:
                run.font.size = font_size

            # Colour is restored only when the run text is unchanged
            key = spec['text'].strip()
            if key in colors_by_text:
                run.font.color.rgb = colors_by_text[key]

        self._apply_paragraph_style(paragraph, original_style)

    def export_bilingual_docx(self, segments: List[Dict[str, Any]], output_path: str):
        """Export as bilingual document (# | source | target in a table).

        Useful for review purposes. All inline and list tags are removed
        from the displayed text.
        """
        import re

        tag_pattern = re.compile(r'</?(?:bi|b|i|u|sub|sup|li-[ob])>')

        def strip_tags(text: str) -> str:
            """Remove formatting tags from text for clean display."""
            return tag_pattern.sub('', text) if text else ""

        print(f"[DOCX Handler] Exporting bilingual document: {output_path}")

        doc = Document()
        doc.add_heading('Bilingual Translation Document', 0)

        table = doc.add_table(rows=1, cols=3)
        table.style = 'Light Grid Accent 1'

        header_cells = table.rows[0].cells
        header_cells[0].text = '#'
        header_cells[1].text = 'Source'
        header_cells[2].text = 'Target'

        for seg in segments:
            row_cells = table.add_row().cells
            row_cells[0].text = str(seg.get('id', ''))
            row_cells[1].text = strip_tags(seg.get('source', ''))
            row_cells[2].text = strip_tags(seg.get('target', ''))

        doc.save(output_path)
        print(f"[DOCX Handler] Bilingual export complete")

    def get_document_info(self) -> Dict[str, Any]:
        """Get information about the loaded document ({} when none is loaded)."""
        if self.original_document is None:
            return {}

        table_cells = sum(1 for info in self.paragraphs_info if info.is_table_cell)
        regular_paras = len(self.paragraphs_info) - table_cells

        return {
            'paragraphs': len(self.original_document.paragraphs),
            'sections': len(self.original_document.sections),
            'tables': len(self.original_document.tables),
            'table_cells': table_cells,
            'regular_paragraphs': regular_paras,
            'total_items': len(self.paragraphs_info),
            'path': self.original_path
        }
678
+
679
+
680
+ # Quick test
681
if __name__ == "__main__":
    # Minimal self-check: only reports whether python-docx could be imported.
    print("DOCX Handler Test")
    print("To test, you need a sample DOCX file.")

    if not DOCX_AVAILABLE:
        print("✗ python-docx is NOT installed")
        print(" Run: pip install python-docx")
    else:
        print("✓ python-docx is installed")