supervertaler-1.9.153-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic. Click here for more details.

Files changed (85)
  1. Supervertaler.py +47886 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1878 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +333 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1172 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.153.dist-info/METADATA +896 -0
  81. supervertaler-1.9.153.dist-info/RECORD +85 -0
  82. supervertaler-1.9.153.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.153.dist-info/top_level.txt +2 -0
@@ -0,0 +1,600 @@
1
+ """
2
+ Supercleaner Module for Supervertaler
3
+ ======================================
4
+
5
+ Cleans up DOCX documents before translation by removing formatting issues,
6
+ excessive tags, and OCR artifacts. Combines functionality similar to:
7
+ - TransTools Document Cleaner (tag/formatting cleanup)
8
+ - TransTools Unbreaker (incorrect line break removal)
9
+
10
+ Author: Michael Beijer / Supervertaler
11
+ """
12
+
13
+ from docx import Document
14
+ from docx.shared import RGBColor, Pt
15
+ from docx.enum.text import WD_COLOR_INDEX
16
+ import re
17
+ from typing import List, Dict, Any
18
+ import logging
19
+
20
+
21
class DocumentCleaner:
    """
    Clean DOCX documents by removing formatting issues and excessive tags.
    Also includes Unbreaker functionality to fix incorrect line/paragraph breaks.

    Operations are selected through the ``operations`` dictionary passed to
    :meth:`clean_document`; a missing key is treated as ``False``. Only the
    paragraphs exposed by ``doc.paragraphs`` are visited.
    """

    # Clark-notation tag of the WordprocessingML shading element (<w:shd>).
    _SHD_TAG = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}shd'

    # (operation key, handler method name) applied to every run, in this order.
    _RUN_OPERATIONS = (
        ('remove_text_shading', '_remove_text_shading'),
        ('remove_highlighting', '_remove_highlighting'),
        ('font_color_to_automatic', '_set_font_color_automatic'),
        ('remove_character_styles', '_remove_character_styles'),
    )

    # (operation key, handler method name) applied to every paragraph after
    # its runs have been processed, in this order.
    _PARAGRAPH_OPERATIONS = (
        ('normalize_font_color', '_normalize_paragraph_font_color'),
        ('normalize_font_size', '_normalize_paragraph_font_size'),
        ('normalize_font', '_normalize_paragraph_font'),
        ('set_default_spacing', '_set_default_spacing'),
    )

    # (operation key, handler method name, report template) applied once to
    # the whole document, in this order. Handlers return a change count.
    _DOCUMENT_OPERATIONS = (
        ('remove_manual_hyphens', '_remove_manual_hyphens',
         "Removed {count} manual hyphens"),
        ('replace_special_symbols', '_replace_special_symbols',
         "Replaced {count} special symbols"),
        ('simplify_quotes_and_dashes', '_simplify_quotes_and_dashes',
         "Simplified {count} quotes/dashes to ASCII"),
        ('fix_line_breaks', '_fix_incorrect_line_breaks',
         "Fixed {count} incorrect line breaks"),
        ('join_broken_sentences', '_join_broken_sentences',
         "Joined {count} broken sentences"),
        ('remove_excessive_spaces', '_remove_excessive_spaces',
         "Cleaned up {count} runs with excessive spaces"),
    )

    # Character translation tables (single pass instead of chained replace()).
    _SPECIAL_SYMBOL_TABLE = str.maketrans({
        '\u2026': '...',  # Ellipsis
        '\u00A0': ' ',    # Non-breaking space (important for TM matching)
    })
    _QUOTE_DASH_TABLE = str.maketrans({
        '\u2018': "'",  # Left single quotation mark -> straight apostrophe
        '\u2019': "'",  # Right single quotation mark -> straight apostrophe
        '\u201C': '"',  # Left double quotation mark -> straight quote
        '\u201D': '"',  # Right double quotation mark -> straight quote
        '\u2013': '-',  # En dash -> hyphen
        '\u2014': '-',  # Em dash -> hyphen (NOT double hyphen)
    })

    # A hyphen directly attached to the preceding word and followed by
    # whitespace. Requiring the word character keeps spaced dashes ("A - B")
    # and leading list dashes intact.
    _HYPHENATION_RE = re.compile(r'(?<=\w)-\s+')
    # A line break followed by something that is neither an ASCII capital, a
    # digit nor whitespace, i.e. the text continues mid-sentence.
    _INCORRECT_BREAK_RE = re.compile(r'[\n\v][^A-Z0-9\s]')
    _WHITESPACE_RE = re.compile(r'\s+')
    _SPACE_RUN_RE = re.compile(r' +')
    _SPACE_BEFORE_PUNCT_RE = re.compile(r' +([,;:!?)])')
    _SPACE_AFTER_OPEN_RE = re.compile(r'([(]) +')

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.operations_performed = []
        self.incorrect_breaks_found = []

    def clean_document(self, input_path: str, output_path: str, operations: Dict[str, bool]) -> Dict[str, Any]:
        """
        Clean a DOCX document based on selected operations

        Args:
            input_path: Path to input DOCX file
            output_path: Path to save cleaned DOCX file
            operations: Dictionary of operation names and whether to perform them

        Returns:
            Dictionary with statistics about operations performed
            ('paragraphs_processed', 'runs_processed', 'changes_made' and a
            list of human-readable 'operations' messages)

        Raises:
            Exception: any error raised while loading, processing or saving
                the document is logged and re-raised unchanged.
        """
        try:
            doc = Document(input_path)
            stats = {
                'paragraphs_processed': 0,
                'runs_processed': 0,
                'changes_made': 0,
                'operations': []
            }

            # Resolve the enabled handlers once instead of per run/paragraph.
            run_handlers = [
                getattr(self, method_name)
                for key, method_name in self._RUN_OPERATIONS
                if operations.get(key, False)
            ]
            paragraph_handlers = [
                getattr(self, method_name)
                for key, method_name in self._PARAGRAPH_OPERATIONS
                if operations.get(key, False)
            ]

            for paragraph in doc.paragraphs:
                stats['paragraphs_processed'] += 1

                for run in paragraph.runs:
                    stats['runs_processed'] += 1
                    for handler in run_handlers:
                        if handler(run):
                            stats['changes_made'] += 1

                for handler in paragraph_handlers:
                    if handler(paragraph):
                        stats['changes_made'] += 1

            # Text content, Unbreaker and spacing operations work on the
            # document as a whole and report how many changes they made.
            for key, method_name, template in self._DOCUMENT_OPERATIONS:
                if not operations.get(key, False):
                    continue
                count = getattr(self, method_name)(doc)
                stats['changes_made'] += count
                if count > 0:
                    stats['operations'].append(template.format(count=count))

            if operations.get('accept_tracked_changes', False):
                # Note: python-docx doesn't fully support tracked changes
                # This would require a more complex implementation
                stats['operations'].append("Tracked changes acceptance not yet implemented")

            # Save cleaned document
            doc.save(output_path)

            return stats

        except Exception as e:
            self.logger.error(f"Error cleaning document: {e}")
            raise

    # ============================================================================
    # RUN-LEVEL OPERATIONS
    # ============================================================================

    def _remove_text_shading(self, run) -> bool:
        """
        Remove background shading (<w:shd>) from a text run.

        The shading element is looked up regardless of whether the run is
        highlighted, and no run-properties element is created when the run
        has none.

        Returns:
            True if a shading element was found and removed, otherwise False.
        """
        try:
            run_properties = run._element.rPr
            if run_properties is None:
                return False
            shading = run_properties.find('.//' + self._SHD_TAG)
            if shading is None:
                return False
            shading.getparent().remove(shading)
            return True
        except Exception as e:
            # Best effort: one malformed run must not abort the whole clean-up.
            self.logger.debug("Could not remove text shading: %s", e)
        return False

    def _remove_highlighting(self, run) -> bool:
        """Remove text highlighting; return True if the run was highlighted."""
        try:
            if run.font.highlight_color is not None:
                run.font.highlight_color = None
                return True
        except Exception as e:
            self.logger.debug("Could not remove highlighting: %s", e)
        return False

    def _set_font_color_automatic(self, run) -> bool:
        """Change an explicit RGB font color to automatic; return True if changed."""
        try:
            color = run.font.color
            if color is not None and color.rgb is not None:
                # Set to automatic (None)
                color.rgb = None
                return True
        except Exception as e:
            self.logger.debug("Could not reset font color: %s", e)
        return False

    def _remove_character_styles(self, run) -> bool:
        """Remove character styles, keeping only direct formatting"""
        try:
            if run.style is not None and run.style.name != 'Default Paragraph Font':
                run.style = None
                return True
        except Exception as e:
            self.logger.debug("Could not remove character style: %s", e)
        return False

    # ============================================================================
    # PARAGRAPH-LEVEL OPERATIONS
    # ============================================================================

    def _normalize_run_attribute(self, paragraph, read, write, label: str) -> bool:
        """
        Apply the most common truthy value of a run attribute to every run.

        Args:
            paragraph: Object exposing a ``runs`` sequence
            read: Callable returning the attribute value of a run
            write: Callable ``(run, value)`` storing the attribute on a run
            label: Attribute description used in log messages

        Returns:
            True if at least one run was changed. Returns False when the
            paragraph has no runs, when no run carries a truthy value, or
            when an error occurs (the error is logged).
        """
        try:
            runs = paragraph.runs
            if not runs:
                return False

            tally = {}
            for run in runs:
                value = read(run)
                if value:
                    tally[value] = tally.get(value, 0) + 1
            if not tally:
                return False

            # On a tie, the value that was seen first wins.
            dominant = max(tally, key=tally.get)

            changed = False
            for run in runs:
                if read(run) != dominant:
                    write(run, dominant)
                    changed = True
            return changed
        except Exception as e:
            self.logger.debug("Could not normalize %s: %s", label, e)
        return False

    def _normalize_paragraph_font_color(self, paragraph) -> bool:
        """Normalize font color across all runs in paragraph to the most common color"""
        def read_color(run):
            color = run.font.color
            return color.rgb if color else None

        def write_color(run, rgb):
            run.font.color.rgb = rgb

        return self._normalize_run_attribute(paragraph, read_color, write_color, 'font color')

    def _normalize_paragraph_font_size(self, paragraph) -> bool:
        """Normalize font size across all runs in paragraph to the most common size"""
        def read_size(run):
            return run.font.size

        def write_size(run, size):
            run.font.size = size

        return self._normalize_run_attribute(paragraph, read_size, write_size, 'font size')

    def _normalize_paragraph_font(self, paragraph) -> bool:
        """Normalize font name across all runs in paragraph to the most common font"""
        def read_name(run):
            return run.font.name

        def write_name(run, name):
            run.font.name = name

        return self._normalize_run_attribute(paragraph, read_name, write_name, 'font name')

    def _set_default_spacing(self, paragraph) -> bool:
        """Set line spacing to single (1.0); return True if it was changed."""
        try:
            paragraph_format = paragraph.paragraph_format
            if paragraph_format.line_spacing != 1.0:
                paragraph_format.line_spacing = 1.0
                return True
        except Exception as e:
            self.logger.debug("Could not set default spacing: %s", e)
        return False

    # ============================================================================
    # TEXT CONTENT OPERATIONS
    # ============================================================================

    def _remove_manual_hyphens(self, doc) -> int:
        """
        Remove manual/soft hyphens from document.

        Soft hyphens (U+00AD) are always removed. A regular hyphen is removed
        together with the whitespace after it only when it is attached to the
        preceding word ("exam-\\nple" -> "example"); a free-standing dash such
        as in "A - B" is left alone.

        Returns:
            Number of runs whose text was changed.
        """
        count = 0
        try:
            for paragraph in doc.paragraphs:
                for run in paragraph.runs:
                    original = run.text
                    if '\u00AD' not in original and '-' not in original:
                        continue
                    cleaned = original.replace('\u00AD', '')
                    cleaned = self._HYPHENATION_RE.sub('', cleaned)
                    if cleaned != original:
                        run.text = cleaned
                        count += 1
        except Exception as e:
            self.logger.error(f"Error removing manual hyphens: {e}")
        return count

    def _translate_runs(self, doc, table, description: str) -> int:
        """
        Translate the text of every run through ``table``.

        A run is only written back when its text actually changed: python-docx
        documents that assigning ``run.text`` replaces the run's existing
        content, so untouched runs must not be reassigned.

        Returns:
            Number of runs whose text was changed.
        """
        count = 0
        try:
            for paragraph in doc.paragraphs:
                for run in paragraph.runs:
                    original = run.text
                    translated = original.translate(table)
                    if translated != original:
                        run.text = translated
                        count += 1
        except Exception as e:
            self.logger.error(f"Error {description}: {e}")
        return count

    def _replace_special_symbols(self, doc) -> int:
        """Replace problematic special symbols (mainly non-breaking spaces and ellipsis)"""
        return self._translate_runs(doc, self._SPECIAL_SYMBOL_TABLE, "replacing special symbols")

    def _simplify_quotes_and_dashes(self, doc) -> int:
        """Convert typographic quotes and dashes to simple ASCII equivalents (OPTIONAL)"""
        return self._translate_runs(doc, self._QUOTE_DASH_TABLE, "simplifying quotes and dashes")

    # ============================================================================
    # UNBREAKER FUNCTIONALITY - Fix incorrect line/paragraph breaks
    # ============================================================================

    def _rewrite_paragraph_text(self, paragraph, new_text: str) -> bool:
        """
        Put ``new_text`` into the first run and blank the other text runs.

        Runs that carry no text are left untouched so that their non-text
        content is not wiped by a needless assignment.

        NOTE(review): formatting of the trailing runs is lost because all
        text ends up in the first run - confirm this is acceptable.

        Returns:
            True if the paragraph had at least one run and was rewritten.
        """
        runs = paragraph.runs
        if not runs:
            return False
        runs[0].text = new_text
        for run in runs[1:]:
            if run.text:
                run.text = ''
        return True

    def _fix_incorrect_line_breaks(self, doc) -> int:
        """
        Fix incorrect line breaks (manual line breaks within sentences).
        Detects line breaks that occur mid-sentence and removes them.

        python-docx reports a manual line break as '\\n' in run/paragraph
        text, so both '\\n' and the vertical tab '\\v' are treated as breaks.
        When a paragraph contains a likely incorrect break, every whitespace
        sequence in it (breaks included) is collapsed to a single space.

        Returns:
            Number of paragraphs that were rewritten.
        """
        count = 0
        try:
            for paragraph in doc.paragraphs:
                original_text = paragraph.text
                if '\n' not in original_text and '\v' not in original_text:
                    continue
                if not self._is_likely_incorrect_break(original_text):
                    continue

                # Breaks are whitespace, so one pass both removes them and
                # cleans up the multiple spaces that would result.
                new_text = self._WHITESPACE_RE.sub(' ', original_text)
                if self._rewrite_paragraph_text(paragraph, new_text):
                    count += 1
        except Exception as e:
            self.logger.error(f"Error fixing line breaks: {e}")
        return count

    def _join_broken_sentences(self, doc) -> int:
        """
        Join sentences that were incorrectly split across paragraphs.

        DISABLED - This operation is too aggressive and causes words to stick
        together. It needs more sophisticated logic to tell true broken
        sentences from intentional paragraph breaks.

        The disabled approach walked consecutive paragraph pairs, used
        :meth:`_is_broken_sentence` on the first one and, when it matched,
        moved "current + ' ' + next" into the current paragraph and blanked
        the next one.

        Returns:
            Always 0; the document is not modified.
        """
        return 0

    def _is_likely_incorrect_break(self, text: str) -> bool:
        """
        Check if a line break is likely incorrect (mid-sentence).

        A break ('\\n' or '\\v') counts as incorrect when the character right
        after it is not an ASCII capital letter, a digit or whitespace.
        """
        return self._INCORRECT_BREAK_RE.search(text) is not None

    def _is_broken_sentence(self, text: str) -> bool:
        """
        Check if text appears to be a broken sentence (doesn't end properly).

        Returns False when the text ends with '.', '!', '?', ':' or ';'.
        Otherwise returns True when it ends with a lowercase letter or a
        comma, or when it is shorter than 50 characters.
        """
        if text.endswith(('.', '!', '?', ':', ';')):
            return False
        ends_lowercase = bool(text) and text[-1].islower()
        return ends_lowercase or text.endswith(',') or len(text) < 50

    # ============================================================================
    # REMOVE EXCESSIVE SPACES FUNCTIONALITY
    # ============================================================================

    def _remove_excessive_spaces(self, doc) -> int:
        """
        Remove excessive spaces between words and around punctuation.

        CRITICAL: We work on full paragraph text, not individual runs,
        because runs are formatting boundaries and may split words.
        Removing trailing spaces from runs causes words to stick together!

        Returns:
            Number of paragraphs that were rewritten.
        """
        count = 0
        try:
            for paragraph in doc.paragraphs:
                original_text = paragraph.text

                # Only process if there's text
                if not original_text or not original_text.strip():
                    continue

                # Collapse space sequences to a single space
                text = self._SPACE_RUN_RE.sub(' ', original_text)
                # Remove spaces before punctuation
                text = self._SPACE_BEFORE_PUNCT_RE.sub(r'\1', text)
                # Remove spaces after opening punctuation
                text = self._SPACE_AFTER_OPEN_RE.sub(r'\1', text)
                # Remove leading/trailing whitespace from paragraph
                text = text.strip()

                if text != original_text and self._rewrite_paragraph_text(paragraph, text):
                    count += 1

        except Exception as e:
            self.logger.error(f"Error removing excessive spaces: {e}")
        return count
533
+
534
+
535
def clean_document_simple(input_path: str, output_path: str = None,
                          quick_clean: bool = True) -> Dict[str, Any]:
    """
    Run DocumentCleaner with the default "quick clean" operation set.

    Args:
        input_path: Path to input DOCX file
        output_path: Path to save cleaned file (if None, overwrites input)
        quick_clean: Value assigned to every operation that is part of the
            quick-clean set; the optional/disabled operations stay False

    Returns:
        Statistics dictionary produced by DocumentCleaner.clean_document
    """
    # (operation name, whether it follows the quick_clean switch)
    # Entries marked False are always off, whatever quick_clean says.
    operation_plan = (
        # Document Cleaner operations
        ('remove_text_shading', True),
        ('remove_highlighting', True),
        ('font_color_to_automatic', True),
        ('normalize_font_color', True),
        ('normalize_font_size', True),
        ('normalize_font', True),
        ('set_default_spacing', True),
        ('remove_manual_hyphens', True),
        ('replace_special_symbols', True),      # Only non-breaking spaces and ellipsis
        ('simplify_quotes_and_dashes', False),  # OPTIONAL - curly quotes/em-dashes to ASCII
        ('remove_character_styles', False),     # More aggressive, optional
        # Unbreaker operations
        ('fix_line_breaks', True),
        ('join_broken_sentences', False),       # DISABLED - too aggressive
        # Remove excessive spaces
        ('remove_excessive_spaces', True),
        # Not yet implemented
        ('accept_tracked_changes', False),
    )
    operations = {
        name: (quick_clean if follows_switch else False)
        for name, follows_switch in operation_plan
    }

    # Without an explicit destination the input file is overwritten in place.
    destination = input_path if output_path is None else output_path

    return DocumentCleaner().clean_document(input_path, destination, operations)
580
+
581
+
582
def default_output_path(input_file: str) -> str:
    """
    Return ``input_file`` with ``_cleaned`` inserted before its extension.

    Only the final extension of the path is considered, whatever its letter
    case, so the result always differs from ``input_file`` and the input is
    never overwritten by accident.
    """
    import os  # local import: the module itself does not otherwise need os

    stem, extension = os.path.splitext(input_file)
    return f"{stem}_cleaned{extension}"


if __name__ == "__main__":
    # Example usage
    import os
    import sys

    if len(sys.argv) < 2:
        # Report the name the script was actually invoked with.
        print(f"Usage: python {os.path.basename(sys.argv[0])} input.docx [output.docx]")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else default_output_path(input_file)

    print(f"Cleaning document: {input_file}")
    stats = clean_document_simple(input_file, output_file)

    print("\nCleaning complete!")
    print(f" Paragraphs processed: {stats['paragraphs_processed']}")
    print(f" Runs processed: {stats['runs_processed']}")
    print(f" Changes made: {stats['changes_made']}")
    print(f" Output saved to: {output_file}")