supervertaler 1.9.153__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic. Click here for more details.
- Supervertaler.py +47886 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1878 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +333 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1172 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.153.dist-info/METADATA +896 -0
- supervertaler-1.9.153.dist-info/RECORD +85 -0
- supervertaler-1.9.153.dist-info/WHEEL +5 -0
- supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.153.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
"""
Trados Bilingual DOCX Handler (Review Files)

This module handles the import and export of Trados Studio bilingual review DOCX files.
Trados uses a table-based format with numbered inline tags.

Format Structure:
- Table with columns: Segment ID | Segment status | Source segment | Target segment
- Tags use character style "Tag" and format: <N>text</N>
- Segment IDs are GUIDs with numeric prefixes
- Statuses: "Not Translated", "Draft", "Translated", etc.

Critical for re-import:
- Tags MUST preserve the "Tag" character style
- Tag numbers must match between source and target
- Segment IDs must remain unchanged
"""
|
|
18
|
+
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
from docx import Document
|
|
22
|
+
from docx.shared import RGBColor, Pt
|
|
23
|
+
from docx.oxml.ns import qn
|
|
24
|
+
from lxml import etree
|
|
25
|
+
from typing import List, Dict, Tuple, Optional
|
|
26
|
+
from copy import deepcopy
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class TradosSegment:
    """
    Represents a Trados segment with tag information.

    Attributes:
        segment_id: Content of the "Segment ID" column for this row.
        status: Content of the "Segment status" column.
        source_text: Source text with inline tags kept as literal text
            (for example ``<1>bold</1>``).
        target_text: Target text, empty when not supplied.
        source_runs: List of ``{text, is_tag, style_xml}`` dicts describing
            the source runs; empty list when not supplied.
        row_index: Index of the table row this segment was read from.
        source_tags: Tag numbers found in ``source_text`` (see
            ``_extract_tags``), computed once at construction.
    """

    # Matches an opening or closing numbered tag such as "<12>" or "</12>".
    # The single capture group is the tag number, which is what
    # re.findall() returns for this pattern.
    _TAG_RE = re.compile(r'</?(\d+)>')

    def __init__(self, segment_id: str, status: str, source_text: str, target_text: str = "",
                 source_runs: Optional[List[Dict]] = None, row_index: int = 0):
        self.segment_id = segment_id
        self.status = status
        self.source_text = source_text  # Plain text with tags as text
        self.target_text = target_text
        # A falsy argument (None or an empty list) is replaced by a fresh list.
        self.source_runs = source_runs or []  # List of {text, is_tag, style_xml} dicts
        self.row_index = row_index

        # Extract tags from source for validation
        self.source_tags = self._extract_tags(source_text)

    def _extract_tags(self, text: str) -> List[str]:
        """
        Return the number of every tag occurrence in ``text``, in order.

        Both opening and closing tags contribute their number, so
        ``"<1>a</1>"`` yields ``["1", "1"]``. Only the digits are returned,
        not the surrounding angle brackets.
        """
        return self._TAG_RE.findall(text)

    @property
    def plain_source(self) -> str:
        """Get source text without tags for translation."""
        return self._TAG_RE.sub('', self.source_text)

    def __repr__(self):
        # The id and source are truncated to keep the representation short;
        # the "..." suffix is appended unconditionally.
        return f"TradosSegment(id={self.segment_id[:20]}..., status={self.status}, source={self.source_text[:40]}...)"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class TradosDOCXHandler:
    """
    Handler for Trados Studio bilingual review DOCX files.

    This class provides methods to:
    - Load and parse Trados bilingual review DOCX files
    - Extract source segments with tag markers
    - Update target segments with translations (preserving tag style)
    - Save modified files ready for re-import to Trados

    Typical call order is load() -> extract_source_segments() ->
    update_target_segments() -> save().
    """

    # Trados tag pattern: <N> or </N> where N is a number.
    # Group 1 is the whole tag, group 2 is the number.
    TAG_PATTERN = re.compile(r'(</?(\d+)>)')

    def __init__(self):
        self.doc = None          # python-docx Document, set by load()
        self.table = None        # First table of the document, set by load()
        self.segments: List["TradosSegment"] = []
        self.file_path = None
        self.header_row = None   # Stripped texts of the first table row
        # Copy of the first "Tag"-styled <w:rPr> found by load(). It is stored
        # for reuse but no method of this class currently reads it.
        self.tag_style_xml = None

    def load(self, file_path: str) -> bool:
        """
        Load a Trados bilingual review DOCX file.

        The first table of the document is used. A header row that differs
        from the expected Trados headers only produces a warning as long as
        its first cell contains "Segment".

        Args:
            file_path: Path to the Trados bilingual DOCX file

        Returns:
            bool: True if loaded successfully, False otherwise
        """
        try:
            self.file_path = file_path
            self.doc = Document(file_path)

            # Trados bilingual files should have exactly one table
            tables = self.doc.tables
            if not tables:
                print(f"ERROR: No table found in {file_path}")
                return False

            self.table = tables[0]

            # Need a header row plus at least one data row
            rows = self.table.rows
            if len(rows) < 2:
                print("ERROR: Table has insufficient rows")
                return False

            self.header_row = [cell.text.strip() for cell in rows[0].cells]

            # Check if this looks like a Trados bilingual DOCX
            expected_headers = ['Segment ID', 'Segment status', 'Source segment', 'Target segment']
            if self.header_row != expected_headers:
                print("WARNING: Headers don't match expected Trados format")
                print(f" Expected: {expected_headers}")
                print(f" Found: {self.header_row}")
                # Continue anyway if it's close enough
                if not self.header_row or 'Segment' not in self.header_row[0]:
                    return False

            # Find and store the Tag style XML for later use
            self._capture_tag_style()

            print(f"Successfully loaded Trados bilingual DOCX: {file_path}")
            print(f"Header: {self.header_row}")
            print(f"Total rows (including header): {len(rows)}")

            return True

        except Exception as e:
            # Boundary for all parsing failures: report and signal via return value.
            print(f"ERROR loading Trados DOCX: {e}")
            import traceback
            traceback.print_exc()
            return False

    @staticmethod
    def _tag_rpr(run):
        """Return the run's <w:rPr> element if it has character style "Tag", else None."""
        rPr = run._r.find(qn('w:rPr'))
        if rPr is None:
            return None
        style_elem = rPr.find(qn('w:rStyle'))
        if style_elem is not None and style_elem.get(qn('w:val')) == 'Tag':
            return rPr
        return None

    def _capture_tag_style(self):
        """Find and capture the Tag style XML from the document.

        Scans the source column (index 2) of every data row and stores a copy
        of the first Tag-styled run's <w:rPr> in ``self.tag_style_xml``.
        Failures are reported as a warning and otherwise ignored.
        """
        try:
            # Look through the document for a run with Tag style
            for row in self.table.rows[1:]:
                source_cell = row.cells[2]
                for para in source_cell.paragraphs:
                    for run in para.runs:
                        rPr = self._tag_rpr(run)
                        if rPr is not None:
                            # Found a Tag style - save the entire rPr as template
                            self.tag_style_xml = deepcopy(rPr)
                            print("Captured Tag style from document")
                            return
        except Exception as e:
            print(f"Warning: Could not capture Tag style: {e}")

    def extract_source_segments(self) -> List["TradosSegment"]:
        """
        Extract all source segments from the Trados bilingual DOCX.

        Rebuilds ``self.segments`` from the data rows of the loaded table.
        A row that cannot be processed is reported and skipped.

        Returns:
            list: List of TradosSegment objects (empty if no table is loaded)
        """
        self.segments = []

        if self.table is None:
            print("ERROR: No table loaded")
            return []

        # Skip header row (index 0), process data rows
        for i, row in enumerate(self.table.rows[1:], start=1):
            try:
                cells = row.cells

                # Extract data from columns
                segment_id = cells[0].text.strip()
                status = cells[1].text.strip()
                source_cell = cells[2]
                # The target column is optional when reading
                target_cell = cells[3] if len(cells) > 3 else None

                source_text = source_cell.text.strip()
                target_text = target_cell.text.strip() if target_cell is not None else ""

                # Extract run information for preserving tag styles
                source_runs = self._extract_runs_with_styles(source_cell)

                self.segments.append(TradosSegment(
                    segment_id=segment_id,
                    status=status,
                    source_text=source_text,
                    target_text=target_text,
                    source_runs=source_runs,
                    row_index=i,
                ))

            except Exception as e:
                print(f"WARNING: Error processing row {i}: {e}")
                continue

        print(f"Extracted {len(self.segments)} segments from Trados DOCX")
        return self.segments

    def _extract_runs_with_styles(self, cell) -> List[Dict]:
        """
        Extract runs from a cell, noting which are tags.

        Returns:
            List of dicts with: {text, is_tag, style_xml}. ``style_xml`` is a
            copy of the run's <w:rPr> for Tag-styled runs and None otherwise.
        """
        runs = []
        for para in cell.paragraphs:
            for run in para.runs:
                tag_rpr = self._tag_rpr(run)
                runs.append({
                    'text': run.text,
                    'is_tag': tag_rpr is not None,
                    'style_xml': deepcopy(tag_rpr) if tag_rpr is not None else None,
                })
        return runs

    def update_target_segments(self, translations: Dict[int, str]) -> int:
        """
        Update target segments with translations.

        For each valid row the existing target runs are removed, the
        translation is written with Tag styling on tag patterns, and a status
        of "Not Translated" is changed to "Translated". Indices outside the
        data rows (including the header row, index 0) are ignored.

        Args:
            translations: Dict mapping row index to translated text

        Returns:
            int: Number of segments updated (0 if no table is loaded)
        """
        if self.table is None:
            print("ERROR: No table loaded")
            return 0

        # Resolve the row collection once instead of on every iteration.
        rows = self.table.rows
        row_count = len(rows)
        updated_count = 0

        for idx, translation in translations.items():
            if not 0 < idx < row_count:
                continue

            # Resolve the row's cells once; they are needed three times below.
            cells = rows[idx].cells
            if len(cells) < 4:
                # No target column in this row: skip it rather than abort
                # the whole update with an IndexError.
                print(f"WARNING: Row {idx} has no target cell; skipped")
                continue
            status_cell, source_cell, target_cell = cells[1], cells[2], cells[3]

            # Clear existing target content
            for para in target_cell.paragraphs:
                for run in list(para.runs):
                    run._r.getparent().remove(run._r)

            # Write target with proper tag styling
            self._write_text_with_tags(target_cell, translation, source_cell)

            # Update status to indicate translation
            if status_cell.text.strip() == "Not Translated":
                self._set_cell_text(status_cell, "Translated")

            updated_count += 1

        print(f"Updated {updated_count} target segments")
        return updated_count

    def _write_text_with_tags(self, target_cell, text: str, source_cell):
        """
        Write text to target cell, applying Tag style to tag patterns.

        This ensures tags in the target have the same style as in the source,
        which is critical for re-import into Trados.

        All runs are appended to the cell's first paragraph; nothing is
        written if the cell has no paragraphs. ``source_cell`` is accepted for
        interface compatibility and is not read here.
        """
        paragraphs = target_cell.paragraphs
        if not paragraphs:
            return

        para = paragraphs[0]

        # Walk the tag matches and emit the plain text between them,
        # which avoids the complexity of split() with capturing groups.
        last_end = 0
        for match in self.TAG_PATTERN.finditer(text):
            # Add any text before this tag
            if match.start() > last_end:
                self._add_plain_run(para, text[last_end:match.start()])

            # Add the tag itself with Tag style, e.g. "<11>" or "</11>"
            run = para.add_run(match.group(0))
            self._apply_tag_style(run)
            self._set_xml_space_preserve(run)

            last_end = match.end()

        # Add any remaining text after the last tag
        if last_end < len(text):
            self._add_plain_run(para, text[last_end:])

    def _add_plain_run(self, para, plain_text: str):
        """Append a non-tag run to ``para`` with default styling and preserved whitespace."""
        run = para.add_run(plain_text)
        self._apply_default_style(run)
        self._set_xml_space_preserve(run)

    def _set_xml_space_preserve(self, run):
        """Set xml:space='preserve' on the run's text element for proper whitespace handling."""
        t_elem = run._r.find(qn('w:t'))
        if t_elem is not None:
            t_elem.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')

    def _apply_tag_style(self, run):
        """Apply the Tag character style to a run."""
        # Create rPr element if needed
        rPr = run._r.find(qn('w:rPr'))
        if rPr is None:
            # SubElement appends at the end of the run; the insert() call
            # then moves the new element to the front of the run's children.
            rPr = etree.SubElement(run._r, qn('w:rPr'))
            run._r.insert(0, rPr)

        # Add rStyle element with Tag value
        style_elem = rPr.find(qn('w:rStyle'))
        if style_elem is None:
            style_elem = etree.SubElement(rPr, qn('w:rStyle'))
        style_elem.set(qn('w:val'), 'Tag')

    def _apply_default_style(self, run):
        """Apply default style (language settings) to a run.

        Note: For target text, we DON'T set language at run level.
        The paragraph has its own default language (en-US for target),
        and runs will inherit from that. Setting the source language
        on target runs would confuse Trados.
        """
        # We intentionally don't set language here anymore
        # Target runs should inherit from paragraph-level language setting
        pass

    def _set_cell_text(self, cell, text: str):
        """Replace the runs of the cell's first paragraph with a single run holding ``text``.

        Does nothing if the cell has no paragraphs.
        """
        if cell.paragraphs:
            para = cell.paragraphs[0]
            # Clear existing runs
            for run in list(para.runs):
                run._r.getparent().remove(run._r)
            para.add_run(text)

    def save(self, output_path: Optional[str] = None) -> bool:
        """
        Save the modified document.

        Args:
            output_path: Path to save to (defaults to original path)

        Returns:
            bool: True if saved successfully, False otherwise (including when
            no document has been loaded)
        """
        if self.doc is None:
            print("ERROR saving Trados DOCX: no document loaded")
            return False

        try:
            save_path = output_path or self.file_path
            self.doc.save(save_path)
            print(f"Saved Trados bilingual DOCX: {save_path}")
            return True
        except Exception as e:
            print(f"ERROR saving Trados DOCX: {e}")
            import traceback
            traceback.print_exc()
            return False

    def get_segments_for_translation(self) -> List[Tuple[int, str, str]]:
        """
        Get segments that need translation.

        A segment qualifies when its status is "Not Translated" or its target
        text is empty. Reads ``self.segments``, which is filled by
        extract_source_segments().

        Returns:
            List of (row_index, source_text, plain_source) tuples
        """
        return [
            (seg.row_index, seg.source_text, seg.plain_source)
            for seg in self.segments
            if seg.status == "Not Translated" or not seg.target_text
        ]
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def detect_bilingual_docx_type(file_path: str) -> str:
    """
    Detect the type of bilingual DOCX file from its table layout.

    Checks are applied in this order:
    - Trados: the first table's header row starts with "Segment ID" and
      contains "Segment status".
    - CafeTran: the first header cell of the first table is "ID".
    - Phrase (Memsource): any table with more than 100 rows whose first row
      has at least 7 cells and whose first cell contains ':'.

    Args:
        file_path: Path to the DOCX file to inspect

    Returns:
        str: "trados", "cafetran", "phrase", or "unknown". memoQ detection is
        not implemented yet, so "memoq" is never returned. Any error while
        opening or inspecting the file is reported and yields "unknown".
    """
    try:
        doc = Document(file_path)

        tables = doc.tables
        if not tables:
            return "unknown"

        first_table_rows = tables[0].rows
        if len(first_table_rows) < 1:
            return "unknown"

        headers = [cell.text.strip() for cell in first_table_rows[0].cells]

        # Trados: Segment ID | Segment status | Source segment | Target segment
        if headers and headers[0] == "Segment ID" and "Segment status" in headers:
            return "trados"

        # CafeTran: ID | filename | filename | Notes | *
        if headers and headers[0] == "ID":
            return "cafetran"

        # Phrase (Memsource): look for a large content table with 7+ columns
        # and a first cell containing ':' (segment IDs have format "xxx:nnn").
        for candidate in tables:
            rows = candidate.rows
            if len(rows) <= 100:
                continue
            first_row_cells = rows[0].cells
            if len(first_row_cells) >= 7 and ':' in first_row_cells[0].text.strip():
                return "phrase"

        # memoQ: Usually has different structure
        # TODO: Add memoQ detection

        return "unknown"

    except Exception as e:
        # Detection is best-effort: an unreadable file is simply "unknown".
        print(f"Error detecting bilingual DOCX type: {e}")
        return "unknown"
|