supervertaler 1.9.153__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic. Click here for more details.
- Supervertaler.py +47886 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1878 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +333 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1172 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.153.dist-info/METADATA +896 -0
- supervertaler-1.9.153.dist-info/RECORD +85 -0
- supervertaler-1.9.153.dist-info/WHEEL +5 -0
- supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.153.dist-info/top_level.txt +2 -0
modules/supercleaner.py
ADDED
|
@@ -0,0 +1,600 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Supercleaner Module for Supervertaler
|
|
3
|
+
======================================
|
|
4
|
+
|
|
5
|
+
Cleans up DOCX documents before translation by removing formatting issues,
|
|
6
|
+
excessive tags, and OCR artifacts. Combines functionality similar to:
|
|
7
|
+
- TransTools Document Cleaner (tag/formatting cleanup)
|
|
8
|
+
- TransTools Unbreaker (incorrect line break removal)
|
|
9
|
+
|
|
10
|
+
Author: Michael Beijer / Supervertaler
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from docx import Document
|
|
14
|
+
from docx.shared import RGBColor, Pt
|
|
15
|
+
from docx.enum.text import WD_COLOR_INDEX
|
|
16
|
+
import re
|
|
17
|
+
from typing import List, Dict, Any
|
|
18
|
+
import logging
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DocumentCleaner:
|
|
22
|
+
"""
|
|
23
|
+
Clean DOCX documents by removing formatting issues and excessive tags.
|
|
24
|
+
Also includes Unbreaker functionality to fix incorrect line/paragraph breaks.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
def __init__(self):
|
|
28
|
+
self.logger = logging.getLogger(__name__)
|
|
29
|
+
self.operations_performed = []
|
|
30
|
+
self.incorrect_breaks_found = []
|
|
31
|
+
|
|
32
|
+
def clean_document(self, input_path: str, output_path: str, operations: Dict[str, bool]) -> Dict[str, Any]:
    """
    Load a DOCX file, run the enabled cleanup operations, and save the result.

    Args:
        input_path: Path of the DOCX file to read.
        output_path: Path the cleaned DOCX file is written to.
        operations: Maps operation names to on/off flags; names that are
            missing are treated as switched off.

    Returns:
        Dictionary with the counters 'paragraphs_processed', 'runs_processed'
        and 'changes_made', plus 'operations', a list of summary messages.

    Raises:
        Exception: Whatever loading, processing or saving raises is logged
            and then re-raised unchanged.
    """
    try:
        def wanted(name):
            return operations.get(name, False)

        doc = Document(input_path)
        stats = {
            'paragraphs_processed': 0,
            'runs_processed': 0,
            'changes_made': 0,
            'operations': []
        }

        # Handlers that take one run and report whether they changed it.
        run_steps = (
            ('remove_text_shading', self._remove_text_shading),
            ('remove_highlighting', self._remove_highlighting),
            ('font_color_to_automatic', self._set_font_color_automatic),
            ('remove_character_styles', self._remove_character_styles),
        )
        # Handlers that take one paragraph and report whether they changed it.
        paragraph_steps = (
            ('normalize_font_color', self._normalize_paragraph_font_color),
            ('normalize_font_size', self._normalize_paragraph_font_size),
            ('normalize_font', self._normalize_paragraph_font),
            ('set_default_spacing', self._set_default_spacing),
        )
        # Handlers that take the whole document and return a change count,
        # paired with the summary message recorded when the count is positive.
        document_steps = (
            ('remove_manual_hyphens', self._remove_manual_hyphens,
             "Removed {count} manual hyphens"),
            ('replace_special_symbols', self._replace_special_symbols,
             "Replaced {count} special symbols"),
            ('simplify_quotes_and_dashes', self._simplify_quotes_and_dashes,
             "Simplified {count} quotes/dashes to ASCII"),
            ('fix_line_breaks', self._fix_incorrect_line_breaks,
             "Fixed {count} incorrect line breaks"),
            ('join_broken_sentences', self._join_broken_sentences,
             "Joined {count} broken sentences"),
            ('remove_excessive_spaces', self._remove_excessive_spaces,
             "Cleaned up {count} runs with excessive spaces"),
        )

        for paragraph in doc.paragraphs:
            stats['paragraphs_processed'] += 1

            # Run-level formatting cleanup comes first ...
            for run in paragraph.runs:
                stats['runs_processed'] += 1
                for flag, handler in run_steps:
                    if wanted(flag) and handler(run):
                        stats['changes_made'] += 1

            # ... followed by the paragraph-level normalisation.
            for flag, handler in paragraph_steps:
                if wanted(flag) and handler(paragraph):
                    stats['changes_made'] += 1

        # Text-content, unbreaker and spacing passes work on the whole document.
        for flag, handler, message in document_steps:
            if not wanted(flag):
                continue
            changed = handler(doc)
            stats['changes_made'] += changed
            if changed > 0:
                stats['operations'].append(message.format(count=changed))

        if wanted('accept_tracked_changes'):
            # Accepting tracked changes is not implemented; only a note is recorded.
            stats['operations'].append("Tracked changes acceptance not yet implemented")

        doc.save(output_path)
        return stats

    except Exception as e:
        self.logger.error(f"Error cleaning document: {e}")
        raise
|
|
147
|
+
|
|
148
|
+
def _remove_text_shading(self, run) -> bool:
|
|
149
|
+
"""Remove background shading from text run"""
|
|
150
|
+
try:
|
|
151
|
+
if run.font.highlight_color is not None or hasattr(run._element, 'shd'):
|
|
152
|
+
# Remove shading from the run element
|
|
153
|
+
shd = run._element.get_or_add_rPr().find('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}shd')
|
|
154
|
+
if shd is not None:
|
|
155
|
+
shd.getparent().remove(shd)
|
|
156
|
+
return True
|
|
157
|
+
except Exception:
|
|
158
|
+
pass
|
|
159
|
+
return False
|
|
160
|
+
|
|
161
|
+
def _remove_highlighting(self, run) -> bool:
|
|
162
|
+
"""Remove text highlighting"""
|
|
163
|
+
try:
|
|
164
|
+
if run.font.highlight_color is not None:
|
|
165
|
+
run.font.highlight_color = None
|
|
166
|
+
return True
|
|
167
|
+
except Exception:
|
|
168
|
+
pass
|
|
169
|
+
return False
|
|
170
|
+
|
|
171
|
+
def _set_font_color_automatic(self, run) -> bool:
|
|
172
|
+
"""Change font color from explicit colors to automatic"""
|
|
173
|
+
try:
|
|
174
|
+
if run.font.color is not None and run.font.color.rgb is not None:
|
|
175
|
+
# Set to automatic (None)
|
|
176
|
+
run.font.color.rgb = None
|
|
177
|
+
return True
|
|
178
|
+
except Exception:
|
|
179
|
+
pass
|
|
180
|
+
return False
|
|
181
|
+
|
|
182
|
+
def _remove_character_styles(self, run) -> bool:
|
|
183
|
+
"""Remove character styles, keeping only direct formatting"""
|
|
184
|
+
try:
|
|
185
|
+
if run.style is not None and run.style.name != 'Default Paragraph Font':
|
|
186
|
+
run.style = None
|
|
187
|
+
return True
|
|
188
|
+
except Exception:
|
|
189
|
+
pass
|
|
190
|
+
return False
|
|
191
|
+
|
|
192
|
+
def _normalize_paragraph_font_color(self, paragraph) -> bool:
|
|
193
|
+
"""Normalize font color across all runs in paragraph to the most common color"""
|
|
194
|
+
try:
|
|
195
|
+
if not paragraph.runs:
|
|
196
|
+
return False
|
|
197
|
+
|
|
198
|
+
# Find most common color
|
|
199
|
+
colors = {}
|
|
200
|
+
for run in paragraph.runs:
|
|
201
|
+
if run.font.color and run.font.color.rgb:
|
|
202
|
+
color = run.font.color.rgb
|
|
203
|
+
colors[color] = colors.get(color, 0) + 1
|
|
204
|
+
|
|
205
|
+
if not colors:
|
|
206
|
+
return False
|
|
207
|
+
|
|
208
|
+
# Get most common color
|
|
209
|
+
most_common = max(colors, key=colors.get)
|
|
210
|
+
|
|
211
|
+
# Apply to all runs
|
|
212
|
+
changed = False
|
|
213
|
+
for run in paragraph.runs:
|
|
214
|
+
if run.font.color is None or run.font.color.rgb != most_common:
|
|
215
|
+
run.font.color.rgb = most_common
|
|
216
|
+
changed = True
|
|
217
|
+
|
|
218
|
+
return changed
|
|
219
|
+
except Exception:
|
|
220
|
+
pass
|
|
221
|
+
return False
|
|
222
|
+
|
|
223
|
+
def _normalize_paragraph_font_size(self, paragraph) -> bool:
|
|
224
|
+
"""Normalize font size across all runs in paragraph to the most common size"""
|
|
225
|
+
try:
|
|
226
|
+
if not paragraph.runs:
|
|
227
|
+
return False
|
|
228
|
+
|
|
229
|
+
# Find most common size
|
|
230
|
+
sizes = {}
|
|
231
|
+
for run in paragraph.runs:
|
|
232
|
+
if run.font.size:
|
|
233
|
+
size = run.font.size
|
|
234
|
+
sizes[size] = sizes.get(size, 0) + 1
|
|
235
|
+
|
|
236
|
+
if not sizes:
|
|
237
|
+
return False
|
|
238
|
+
|
|
239
|
+
# Get most common size
|
|
240
|
+
most_common = max(sizes, key=sizes.get)
|
|
241
|
+
|
|
242
|
+
# Apply to all runs
|
|
243
|
+
changed = False
|
|
244
|
+
for run in paragraph.runs:
|
|
245
|
+
if run.font.size != most_common:
|
|
246
|
+
run.font.size = most_common
|
|
247
|
+
changed = True
|
|
248
|
+
|
|
249
|
+
return changed
|
|
250
|
+
except Exception:
|
|
251
|
+
pass
|
|
252
|
+
return False
|
|
253
|
+
|
|
254
|
+
def _normalize_paragraph_font(self, paragraph) -> bool:
|
|
255
|
+
"""Normalize font name across all runs in paragraph to the most common font"""
|
|
256
|
+
try:
|
|
257
|
+
if not paragraph.runs:
|
|
258
|
+
return False
|
|
259
|
+
|
|
260
|
+
# Find most common font
|
|
261
|
+
fonts = {}
|
|
262
|
+
for run in paragraph.runs:
|
|
263
|
+
if run.font.name:
|
|
264
|
+
font = run.font.name
|
|
265
|
+
fonts[font] = fonts.get(font, 0) + 1
|
|
266
|
+
|
|
267
|
+
if not fonts:
|
|
268
|
+
return False
|
|
269
|
+
|
|
270
|
+
# Get most common font
|
|
271
|
+
most_common = max(fonts, key=fonts.get)
|
|
272
|
+
|
|
273
|
+
# Apply to all runs
|
|
274
|
+
changed = False
|
|
275
|
+
for run in paragraph.runs:
|
|
276
|
+
if run.font.name != most_common:
|
|
277
|
+
run.font.name = most_common
|
|
278
|
+
changed = True
|
|
279
|
+
|
|
280
|
+
return changed
|
|
281
|
+
except Exception:
|
|
282
|
+
pass
|
|
283
|
+
return False
|
|
284
|
+
|
|
285
|
+
def _set_default_spacing(self, paragraph) -> bool:
|
|
286
|
+
"""Set default paragraph spacing"""
|
|
287
|
+
try:
|
|
288
|
+
# Set line spacing to single (1.0)
|
|
289
|
+
if paragraph.paragraph_format.line_spacing != 1.0:
|
|
290
|
+
paragraph.paragraph_format.line_spacing = 1.0
|
|
291
|
+
return True
|
|
292
|
+
except Exception:
|
|
293
|
+
pass
|
|
294
|
+
return False
|
|
295
|
+
|
|
296
|
+
def _remove_manual_hyphens(self, doc) -> int:
|
|
297
|
+
"""Remove manual/soft hyphens from document"""
|
|
298
|
+
count = 0
|
|
299
|
+
try:
|
|
300
|
+
for paragraph in doc.paragraphs:
|
|
301
|
+
for run in paragraph.runs:
|
|
302
|
+
if '\u00AD' in run.text or '\u002D' in run.text: # Soft hyphen and regular hyphen
|
|
303
|
+
original = run.text
|
|
304
|
+
# Remove soft hyphens
|
|
305
|
+
run.text = run.text.replace('\u00AD', '')
|
|
306
|
+
# Remove hyphens at end of lines (manual hyphenation)
|
|
307
|
+
run.text = re.sub(r'-\s+', '', run.text)
|
|
308
|
+
if run.text != original:
|
|
309
|
+
count += 1
|
|
310
|
+
except Exception:
|
|
311
|
+
pass
|
|
312
|
+
return count
|
|
313
|
+
|
|
314
|
+
def _replace_special_symbols(self, doc) -> int:
|
|
315
|
+
"""Replace problematic special symbols (mainly non-breaking spaces and ellipsis)"""
|
|
316
|
+
count = 0
|
|
317
|
+
replacements = {
|
|
318
|
+
'\u2026': '...', # Ellipsis
|
|
319
|
+
'\u00A0': ' ', # Non-breaking space (important for TM matching)
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
try:
|
|
323
|
+
for paragraph in doc.paragraphs:
|
|
324
|
+
for run in paragraph.runs:
|
|
325
|
+
original = run.text
|
|
326
|
+
for special, regular in replacements.items():
|
|
327
|
+
run.text = run.text.replace(special, regular)
|
|
328
|
+
if run.text != original:
|
|
329
|
+
count += 1
|
|
330
|
+
except Exception:
|
|
331
|
+
pass
|
|
332
|
+
return count
|
|
333
|
+
|
|
334
|
+
def _simplify_quotes_and_dashes(self, doc) -> int:
|
|
335
|
+
"""Convert typographic quotes and dashes to simple ASCII equivalents (OPTIONAL)"""
|
|
336
|
+
count = 0
|
|
337
|
+
replacements = {
|
|
338
|
+
'\u2018': "'", # Left single quotation mark → straight apostrophe
|
|
339
|
+
'\u2019': "'", # Right single quotation mark → straight apostrophe
|
|
340
|
+
'\u201C': '"', # Left double quotation mark → straight quote
|
|
341
|
+
'\u201D': '"', # Right double quotation mark → straight quote
|
|
342
|
+
'\u2013': '-', # En dash → hyphen
|
|
343
|
+
'\u2014': '-', # Em dash → hyphen (NOT double hyphen)
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
try:
|
|
347
|
+
for paragraph in doc.paragraphs:
|
|
348
|
+
for run in paragraph.runs:
|
|
349
|
+
original = run.text
|
|
350
|
+
for special, regular in replacements.items():
|
|
351
|
+
run.text = run.text.replace(special, regular)
|
|
352
|
+
if run.text != original:
|
|
353
|
+
count += 1
|
|
354
|
+
except Exception:
|
|
355
|
+
pass
|
|
356
|
+
return count
|
|
357
|
+
|
|
358
|
+
# ============================================================================
|
|
359
|
+
# UNBREAKER FUNCTIONALITY - Fix incorrect line/paragraph breaks
|
|
360
|
+
# ============================================================================
|
|
361
|
+
|
|
362
|
+
def _fix_incorrect_line_breaks(self, doc) -> int:
|
|
363
|
+
"""
|
|
364
|
+
Fix incorrect line breaks (manual line breaks within sentences).
|
|
365
|
+
Detects line breaks that occur mid-sentence and removes them.
|
|
366
|
+
"""
|
|
367
|
+
count = 0
|
|
368
|
+
try:
|
|
369
|
+
for paragraph in doc.paragraphs:
|
|
370
|
+
# Check for line breaks within the paragraph text
|
|
371
|
+
original_text = paragraph.text
|
|
372
|
+
|
|
373
|
+
# Line break character in Word is '\v' or '\x0B'
|
|
374
|
+
if '\v' in original_text or '\x0B' in original_text:
|
|
375
|
+
# Check if these are likely incorrect (mid-sentence)
|
|
376
|
+
if self._is_likely_incorrect_break(original_text):
|
|
377
|
+
# Remove line breaks and replace with space
|
|
378
|
+
new_text = original_text.replace('\v', ' ').replace('\x0B', ' ')
|
|
379
|
+
# Clean up multiple spaces
|
|
380
|
+
new_text = re.sub(r'\s+', ' ', new_text)
|
|
381
|
+
|
|
382
|
+
# Update paragraph text
|
|
383
|
+
if paragraph.runs:
|
|
384
|
+
paragraph.runs[0].text = new_text
|
|
385
|
+
# Clear other runs
|
|
386
|
+
for i in range(len(paragraph.runs) - 1, 0, -1):
|
|
387
|
+
paragraph.runs[i].text = ''
|
|
388
|
+
count += 1
|
|
389
|
+
except Exception as e:
|
|
390
|
+
self.logger.error(f"Error fixing line breaks: {e}")
|
|
391
|
+
return count
|
|
392
|
+
|
|
393
|
+
def _join_broken_sentences(self, doc) -> int:
|
|
394
|
+
"""
|
|
395
|
+
Join sentences that were incorrectly split across paragraphs.
|
|
396
|
+
Detects paragraphs that don't end with sentence-ending punctuation
|
|
397
|
+
and joins them with the next paragraph.
|
|
398
|
+
|
|
399
|
+
DISABLED BY DEFAULT - This operation is too aggressive and causes
|
|
400
|
+
words to stick together. Needs more sophisticated logic to detect
|
|
401
|
+
true broken sentences vs intentional paragraph breaks.
|
|
402
|
+
"""
|
|
403
|
+
count = 0
|
|
404
|
+
# TEMPORARILY DISABLED due to word spacing bugs
|
|
405
|
+
# The current logic joins too many paragraphs incorrectly
|
|
406
|
+
return count
|
|
407
|
+
|
|
408
|
+
# Original code kept for reference but not executed:
|
|
409
|
+
# try:
|
|
410
|
+
# paragraphs = list(doc.paragraphs)
|
|
411
|
+
# i = 0
|
|
412
|
+
#
|
|
413
|
+
# while i < len(paragraphs) - 1:
|
|
414
|
+
# current_para = paragraphs[i]
|
|
415
|
+
# next_para = paragraphs[i + 1]
|
|
416
|
+
#
|
|
417
|
+
# current_text = current_para.text.strip()
|
|
418
|
+
# next_text = next_para.text.strip()
|
|
419
|
+
#
|
|
420
|
+
# # Skip empty paragraphs
|
|
421
|
+
# if not current_text or not next_text:
|
|
422
|
+
# i += 1
|
|
423
|
+
# continue
|
|
424
|
+
#
|
|
425
|
+
# # Check if current paragraph ends mid-sentence
|
|
426
|
+
# if self._is_broken_sentence(current_text):
|
|
427
|
+
# # Join paragraphs WITH PROPER SPACING
|
|
428
|
+
# joined_text = current_text + ' ' + next_text
|
|
429
|
+
#
|
|
430
|
+
# # Update current paragraph
|
|
431
|
+
# if current_para.runs:
|
|
432
|
+
# current_para.runs[0].text = joined_text
|
|
433
|
+
# # Clear other runs
|
|
434
|
+
# for j in range(len(current_para.runs) - 1, 0, -1):
|
|
435
|
+
# current_para.runs[j].text = ''
|
|
436
|
+
#
|
|
437
|
+
# # Clear next paragraph
|
|
438
|
+
# if next_para.runs:
|
|
439
|
+
# for run in next_para.runs:
|
|
440
|
+
# run.text = ''
|
|
441
|
+
#
|
|
442
|
+
# count += 1
|
|
443
|
+
#
|
|
444
|
+
# i += 1
|
|
445
|
+
#
|
|
446
|
+
# except Exception as e:
|
|
447
|
+
# self.logger.error(f"Error joining broken sentences: {e}")
|
|
448
|
+
# return count
|
|
449
|
+
|
|
450
|
+
def _is_likely_incorrect_break(self, text: str) -> bool:
|
|
451
|
+
"""Check if a line break is likely incorrect (mid-sentence)"""
|
|
452
|
+
# Line breaks before lowercase letters are often incorrect
|
|
453
|
+
if re.search(r'\v[a-z]', text) or re.search(r'\x0B[a-z]', text):
|
|
454
|
+
return True
|
|
455
|
+
# Line breaks not followed by capital letters or numbers
|
|
456
|
+
if re.search(r'\v[^A-Z0-9\s]', text) or re.search(r'\x0B[^A-Z0-9\s]', text):
|
|
457
|
+
return True
|
|
458
|
+
return False
|
|
459
|
+
|
|
460
|
+
def _is_broken_sentence(self, text: str) -> bool:
|
|
461
|
+
"""Check if text appears to be a broken sentence (doesn't end properly)"""
|
|
462
|
+
# Ends with sentence-ending punctuation
|
|
463
|
+
sentence_enders = ('.', '!', '?', ':', ';')
|
|
464
|
+
|
|
465
|
+
# Skip if ends with sentence-ending punctuation
|
|
466
|
+
if text.endswith(sentence_enders):
|
|
467
|
+
return False
|
|
468
|
+
|
|
469
|
+
# Likely broken if ends with lowercase letter
|
|
470
|
+
if text and text[-1].islower():
|
|
471
|
+
return True
|
|
472
|
+
|
|
473
|
+
# Likely broken if ends with comma
|
|
474
|
+
if text.endswith(','):
|
|
475
|
+
return True
|
|
476
|
+
|
|
477
|
+
# Likely broken if very short (less than 50 chars)
|
|
478
|
+
if len(text) < 50:
|
|
479
|
+
return True
|
|
480
|
+
|
|
481
|
+
return False
|
|
482
|
+
|
|
483
|
+
# ============================================================================
|
|
484
|
+
# REMOVE EXCESSIVE SPACES FUNCTIONALITY
|
|
485
|
+
# ============================================================================
|
|
486
|
+
|
|
487
|
+
def _remove_excessive_spaces(self, doc) -> int:
|
|
488
|
+
"""
|
|
489
|
+
Remove excessive spaces between words and around punctuation.
|
|
490
|
+
|
|
491
|
+
CRITICAL: We work on full paragraph text, not individual runs,
|
|
492
|
+
because runs are formatting boundaries and may split words.
|
|
493
|
+
Removing trailing spaces from runs causes words to stick together!
|
|
494
|
+
"""
|
|
495
|
+
count = 0
|
|
496
|
+
try:
|
|
497
|
+
for paragraph in doc.paragraphs:
|
|
498
|
+
original_text = paragraph.text
|
|
499
|
+
|
|
500
|
+
# Only process if there's text
|
|
501
|
+
if not original_text or not original_text.strip():
|
|
502
|
+
continue
|
|
503
|
+
|
|
504
|
+
# Work on the full paragraph text
|
|
505
|
+
text = original_text
|
|
506
|
+
|
|
507
|
+
# Replace multiple spaces (2+) with single space
|
|
508
|
+
text = re.sub(r' +', ' ', text)
|
|
509
|
+
|
|
510
|
+
# Remove spaces before punctuation (but be careful with abbreviations)
|
|
511
|
+
text = re.sub(r' +([,;:!?)])', r'\1', text)
|
|
512
|
+
|
|
513
|
+
# Remove spaces after opening punctuation
|
|
514
|
+
text = re.sub(r'([(]) +', r'\1', text)
|
|
515
|
+
|
|
516
|
+
# Remove leading/trailing spaces from paragraph
|
|
517
|
+
text = text.strip()
|
|
518
|
+
|
|
519
|
+
# Only update if changed
|
|
520
|
+
if text != original_text:
|
|
521
|
+
# Reconstruct paragraph with cleaned text
|
|
522
|
+
# Keep the first run and put all text there, clear others
|
|
523
|
+
if paragraph.runs:
|
|
524
|
+
paragraph.runs[0].text = text
|
|
525
|
+
# Clear remaining runs
|
|
526
|
+
for i in range(len(paragraph.runs) - 1, 0, -1):
|
|
527
|
+
paragraph.runs[i].text = ''
|
|
528
|
+
count += 1
|
|
529
|
+
|
|
530
|
+
except Exception as e:
|
|
531
|
+
self.logger.error(f"Error removing excessive spaces: {e}")
|
|
532
|
+
return count
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def clean_document_simple(input_path: str, output_path: str = None,
                          quick_clean: bool = True) -> Dict[str, Any]:
    """
    Clean a DOCX file with the default "quick clean" selection of operations.

    Args:
        input_path: Path to input DOCX file
        output_path: Path to save cleaned file (if None, overwrites input)
        quick_clean: Value given to every default operation flag; the
            optional, disabled and unimplemented operations stay False
            regardless.

    Returns:
        Statistics dictionary produced by DocumentCleaner.clean_document
    """
    target_path = input_path if output_path is None else output_path

    # (operation name, follows quick_clean?) in the order the flags are listed.
    plan = (
        # Document Cleaner operations
        ('remove_text_shading', True),
        ('remove_highlighting', True),
        ('font_color_to_automatic', True),
        ('normalize_font_color', True),
        ('normalize_font_size', True),
        ('normalize_font', True),
        ('set_default_spacing', True),
        ('remove_manual_hyphens', True),
        ('replace_special_symbols', True),      # Only non-breaking spaces and ellipsis
        ('simplify_quotes_and_dashes', False),  # OPTIONAL - converts curly quotes/em-dashes to ASCII
        ('remove_character_styles', False),     # More aggressive, optional
        # Unbreaker operations
        ('fix_line_breaks', True),
        ('join_broken_sentences', False),       # DISABLED - too aggressive, causes word spacing issues
        # Remove excessive spaces
        ('remove_excessive_spaces', True),
        # Not yet implemented
        ('accept_tracked_changes', False),
    )
    operations = {
        name: (quick_clean if follows_quick else False)
        for name, follows_quick in plan
    }

    return DocumentCleaner().clean_document(input_path, target_path, operations)
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
if __name__ == "__main__":
    # Command-line entry point: clean one DOCX file and print the statistics.
    import os
    import sys

    if len(sys.argv) < 2:
        # Show the name the script was actually started with.
        print(f"Usage: python {os.path.basename(sys.argv[0])} input.docx [output.docx]")
        sys.exit(1)

    input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    else:
        # Insert "_cleaned" before the extension. The previous
        # str.replace('.docx', ...) left the name unchanged for inputs such
        # as "report.DOCX", so the input file was silently overwritten.
        stem, extension = os.path.splitext(input_file)
        output_file = f"{stem}_cleaned{extension}"

    print(f"Cleaning document: {input_file}")
    stats = clean_document_simple(input_file, output_file)

    print("\nCleaning complete!")
    print(f"  Paragraphs processed: {stats['paragraphs_processed']}")
    print(f"  Runs processed: {stats['runs_processed']}")
    print(f"  Changes made: {stats['changes_made']}")
    print(f"  Output saved to: {output_file}")
|