supervertaler-1.9.163-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. Supervertaler.py +48473 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1911 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +351 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1176 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.163.dist-info/METADATA +906 -0
  81. supervertaler-1.9.163.dist-info/RECORD +85 -0
  82. supervertaler-1.9.163.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.163.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.163.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.163.dist-info/top_level.txt +2 -0
@@ -0,0 +1,433 @@
+ """
+ Termbase Manager Module
+
+ Handles glossary/termbase management for Supervertaler:
+ - Create/delete glossaries
+ - Add/edit/delete terms
+ - Activate/deactivate for projects
+ - Import/export glossaries
+ - Search across termbases
+
+ Unified management for both global and project-specific termbases.
+ """
+
+ from typing import List, Dict, Optional, Tuple
+ from dataclasses import dataclass
+ from datetime import datetime
+
+
+ @dataclass
+ class TermbaseInfo:
+     """Information about a glossary/termbase"""
+     id: int
+     name: str
+     description: str
+     source_lang: Optional[str]
+     target_lang: Optional[str]
+     project_id: Optional[int]  # None = global, set = project-specific
+     created_date: str
+     modified_date: str
+     entry_count: int
+     is_active_for_project: bool = False
+
+
+ @dataclass
+ class TermbaseEntry:
+     """A single term entry in a termbase"""
+     id: int
+     termbase_id: int
+     source_term: str
+     target_term: str
+     priority: int  # 1-99, lower = higher priority
+     domain: str
+     definition: str
+     forbidden: bool
+     non_translatable: bool
+     created_date: str
+     modified_date: str
+
+
+ class TermbaseManager:
+     """Manages glossaries and termbases"""
+
+     def __init__(self, db_manager, log_callback=None):
+         """
+         Initialize termbase manager
+
+         Args:
+             db_manager: DatabaseManager instance
+             log_callback: Optional logging function
+         """
+         self.db = db_manager
+         self.log = log_callback if log_callback else print
+
+     def create_termbase(
+         self,
+         name: str,
+         description: str = "",
+         source_lang: Optional[str] = None,
+         target_lang: Optional[str] = None,
+         project_id: Optional[int] = None
+     ) -> int:
+         """
+         Create a new termbase
+
+         Args:
+             name: Termbase name
+             description: Optional description
+             source_lang: Source language code (e.g., 'NL', 'EN')
+             target_lang: Target language code
+             project_id: Optional project ID (None = global termbase)
+
+         Returns:
+             Termbase ID
+         """
+         try:
+             cursor = self.db.cursor
+             now = datetime.now().isoformat()
+
+             cursor.execute("""
+                 INSERT INTO glossaries (name, description, source_lang, target_lang, project_id, created_date, modified_date)
+                 VALUES (?, ?, ?, ?, ?, ?, ?)
+             """, (name, description, source_lang, target_lang, project_id, now, now))
+
+             self.db.connection.commit()
+             termbase_id = cursor.lastrowid
+             self.log(f"Created termbase '{name}' (ID: {termbase_id})")
+             return termbase_id
+         except Exception as e:
+             self.log(f"Error creating termbase: {e}")
+             raise
+
+     def get_all_termbases(self) -> List[TermbaseInfo]:
+         """Get all glossaries (global and project-specific)"""
+         try:
+             cursor = self.db.cursor
+             cursor.execute("""
+                 SELECT
+                     g.id, g.name, g.description, g.source_lang, g.target_lang,
+                     g.project_id, g.created_date, g.modified_date,
+                     COUNT(gt.id) as entry_count
+                 FROM glossaries g
+                 LEFT JOIN termbase_terms gt ON g.id = gt.termbase_id
+                 GROUP BY g.id
+                 ORDER BY g.name
+             """)
+
+             results = cursor.fetchall()
+             glossaries = []
+             for row in results:
+                 glossaries.append(TermbaseInfo(
+                     id=row[0],
+                     name=row[1],
+                     description=row[2],
+                     source_lang=row[3],
+                     target_lang=row[4],
+                     project_id=row[5],
+                     created_date=row[6],
+                     modified_date=row[7],
+                     entry_count=row[8] or 0
+                 ))
+             return glossaries
+         except Exception as e:
+             self.log(f"Error fetching glossaries: {e}")
+             return []
+
+     def get_termbase_terms(self, termbase_id: int) -> List[TermbaseEntry]:
+         """Get all terms in a termbase"""
+         try:
+             cursor = self.db.cursor
+             cursor.execute("""
+                 SELECT id, termbase_id, source_term, target_term, priority,
+                        domain, definition, forbidden, non_translatable, created_date, modified_date
+                 FROM termbase_terms
+                 WHERE termbase_id = ?
+                 ORDER BY priority ASC, source_term ASC
+             """, (termbase_id,))
+
+             results = cursor.fetchall()
+             terms = []
+             for row in results:
+                 terms.append(TermbaseEntry(
+                     id=row[0],
+                     termbase_id=row[1],
+                     source_term=row[2],
+                     target_term=row[3],
+                     priority=row[4],
+                     domain=row[5],
+                     definition=row[6],
+                     forbidden=bool(row[7]),
+                     non_translatable=bool(row[8]),
+                     created_date=row[9],
+                     modified_date=row[10]
+                 ))
+             return terms
+         except Exception as e:
+             self.log(f"Error fetching terms for termbase {termbase_id}: {e}")
+             return []
+
+     def add_term(
+         self,
+         termbase_id: int,
+         source_term: str,
+         target_term: str,
+         priority: int = 50,
+         domain: str = "",
+         definition: str = "",
+         forbidden: bool = False,
+         non_translatable: bool = False
+     ) -> int:
+         """
+         Add a term to a termbase
+
+         Args:
+             termbase_id: Target termbase ID
+             source_term: Source language term
+             target_term: Target language term
+             priority: Priority ranking (1-99, lower = higher)
+             domain: Domain/subject area
+             definition: Definition or note
+             forbidden: Whether term is forbidden for translation
+             non_translatable: Whether term should not be translated
+
+         Returns:
+             Term ID
+         """
+         try:
+             cursor = self.db.cursor
+             now = datetime.now().isoformat()
+
+             cursor.execute("""
+                 INSERT INTO termbase_terms
+                     (termbase_id, source_term, target_term, priority, domain, definition,
+                      forbidden, non_translatable, source_lang, target_lang, created_date, modified_date)
+                 SELECT ?, ?, ?, ?, ?, ?, ?, ?, source_lang, target_lang, ?, ?
+                 FROM glossaries
+                 WHERE id = ?
+             """, (termbase_id, source_term, target_term, priority, domain, definition,
+                   forbidden, non_translatable, now, now, termbase_id))
+
+             self.db.connection.commit()
+             term_id = cursor.lastrowid
+             self.log(f"Added term '{source_term}' to termbase {termbase_id}")
+             return term_id
+         except Exception as e:
+             self.log(f"Error adding term: {e}")
+             raise
+
+     def update_term(
+         self,
+         term_id: int,
+         source_term: str = None,
+         target_term: str = None,
+         priority: int = None,
+         domain: str = None,
+         definition: str = None,
+         forbidden: bool = None,
+         non_translatable: bool = None
+     ) -> bool:
+         """Update a term in a termbase"""
+         try:
+             cursor = self.db.cursor
+             now = datetime.now().isoformat()
+
+             # Build dynamic update query
+             updates = ["modified_date = ?"]
+             params = [now]
+
+             if source_term is not None:
+                 updates.append("source_term = ?")
+                 params.append(source_term)
+             if target_term is not None:
+                 updates.append("target_term = ?")
+                 params.append(target_term)
+             if priority is not None:
+                 updates.append("priority = ?")
+                 params.append(priority)
+             if domain is not None:
+                 updates.append("domain = ?")
+                 params.append(domain)
+             if definition is not None:
+                 updates.append("definition = ?")
+                 params.append(definition)
+             if forbidden is not None:
+                 updates.append("forbidden = ?")
+                 params.append(forbidden)
+             if non_translatable is not None:
+                 updates.append("non_translatable = ?")
+                 params.append(non_translatable)
+
+             params.append(term_id)
+             query = f"UPDATE termbase_terms SET {', '.join(updates)} WHERE id = ?"
+
+             cursor.execute(query, params)
+             self.db.connection.commit()
+             return cursor.rowcount > 0
+         except Exception as e:
+             self.log(f"Error updating term {term_id}: {e}")
+             return False
+
+     def delete_term(self, term_id: int) -> bool:
+         """Delete a term from a termbase"""
+         try:
+             cursor = self.db.cursor
+             cursor.execute("DELETE FROM termbase_terms WHERE id = ?", (term_id,))
+             self.db.connection.commit()
+             return cursor.rowcount > 0
+         except Exception as e:
+             self.log(f"Error deleting term {term_id}: {e}")
+             return False
+
+     def delete_termbase(self, termbase_id: int) -> bool:
+         """Delete a termbase and all its terms"""
+         try:
+             cursor = self.db.cursor
+             # Delete terms first
+             cursor.execute("DELETE FROM termbase_terms WHERE termbase_id = ?", (termbase_id,))
+             # Delete termbase
+             cursor.execute("DELETE FROM glossaries WHERE id = ?", (termbase_id,))
+             self.db.connection.commit()
+             self.log(f"Deleted termbase {termbase_id}")
+             return cursor.rowcount > 0
+         except Exception as e:
+             self.log(f"Error deleting termbase {termbase_id}: {e}")
+             return False
+
+     def activate_for_project(self, termbase_id: int, project_id: int) -> bool:
+         """Mark a termbase as active for a specific project"""
+         try:
+             cursor = self.db.cursor
+             cursor.execute("""
+                 INSERT OR REPLACE INTO termbase_project_activation (termbase_id, project_id, activated_date)
+                 VALUES (?, ?, datetime('now'))
+             """, (termbase_id, project_id))
+             self.db.connection.commit()
+             return True
+         except Exception as e:
+             self.log(f"Error activating termbase: {e}")
+             return False
+
+     def deactivate_for_project(self, termbase_id: int, project_id: int) -> bool:
+         """Mark a termbase as inactive for a specific project"""
+         try:
+             cursor = self.db.cursor
+             cursor.execute("""
+                 DELETE FROM termbase_project_activation
+                 WHERE termbase_id = ? AND project_id = ?
+             """, (termbase_id, project_id))
+             self.db.connection.commit()
+             return True
+         except Exception as e:
+             self.log(f"Error deactivating termbase: {e}")
+             return False
+
+     def is_active_for_project(self, termbase_id: int, project_id: int) -> bool:
+         """Check if termbase is active for a project"""
+         try:
+             cursor = self.db.cursor
+             cursor.execute("""
+                 SELECT 1 FROM termbase_project_activation
+                 WHERE termbase_id = ? AND project_id = ?
+             """, (termbase_id, project_id))
+             return cursor.fetchone() is not None
+         except Exception as e:
+             self.log(f"Error checking activation status: {e}")
+             return False
+
+     def get_active_glossaries_for_project(self, project_id: int) -> List[TermbaseInfo]:
+         """Get all glossaries active for a specific project (global + project-specific)"""
+         try:
+             cursor = self.db.cursor
+             # Get global glossaries (project_id IS NULL) that are activated
+             # Plus project-specific glossaries (project_id = target_project)
+             cursor.execute("""
+                 SELECT DISTINCT
+                     g.id, g.name, g.description, g.source_lang, g.target_lang,
+                     g.project_id, g.created_date, g.modified_date,
+                     COUNT(gt.id) as entry_count
+                 FROM glossaries g
+                 LEFT JOIN termbase_terms gt ON g.id = gt.termbase_id
+                 WHERE (g.project_id = ? OR
+                        (g.project_id IS NULL AND g.id IN
+                         (SELECT termbase_id FROM termbase_project_activation WHERE project_id = ?)))
+                 GROUP BY g.id
+                 ORDER BY g.name
+             """, (project_id, project_id))
+
+             results = cursor.fetchall()
+             glossaries = []
+             for row in results:
+                 glossaries.append(TermbaseInfo(
+                     id=row[0],
+                     name=row[1],
+                     description=row[2],
+                     source_lang=row[3],
+                     target_lang=row[4],
+                     project_id=row[5],
+                     created_date=row[6],
+                     modified_date=row[7],
+                     entry_count=row[8] or 0,
+                     is_active_for_project=True
+                 ))
+             return glossaries
+         except Exception as e:
+             self.log(f"Error fetching active glossaries: {e}")
+             return []
+
+     def export_glossary_to_csv(self, termbase_id: int, filepath: str) -> bool:
+         """Export termbase to CSV format"""
+         try:
+             import csv
+             terms = self.get_termbase_terms(termbase_id)
+
+             with open(filepath, 'w', newline='', encoding='utf-8') as f:
+                 writer = csv.writer(f)
+                 writer.writerow(['Source Term', 'Target Term', 'Domain', 'Definition', 'Priority', 'Forbidden', 'Non-Translatable'])
+
+                 for term in terms:
+                     writer.writerow([
+                         term.source_term,
+                         term.target_term,
+                         term.domain,
+                         term.definition,
+                         term.priority,
+                         'Yes' if term.forbidden else 'No',
+                         'Yes' if term.non_translatable else 'No'
+                     ])
+
+             self.log(f"Exported termbase {termbase_id} to {filepath}")
+             return True
+         except Exception as e:
+             self.log(f"Error exporting termbase: {e}")
+             return False
+
+     def import_glossary_from_csv(self, termbase_id: int, filepath: str) -> int:
+         """Import terms into termbase from CSV file"""
+         try:
+             import csv
+             count = 0
+
+             with open(filepath, 'r', encoding='utf-8') as f:
+                 reader = csv.DictReader(f)
+                 for row in reader:
+                     priority = int(row.get('Priority', 50))
+                     forbidden = row.get('Forbidden', 'No').lower() == 'yes'
+                     non_translatable = row.get('Non-Translatable', 'No').lower() == 'yes'
+
+                     self.add_term(
+                         termbase_id,
+                         row['Source Term'],
+                         row['Target Term'],
+                         priority=priority,
+                         domain=row.get('Domain', ''),
+                         definition=row.get('Definition', ''),
+                         forbidden=forbidden,
+                         non_translatable=non_translatable
+                     )
+                     count += 1
+
+             self.log(f"Imported {count} terms into termbase {termbase_id}")
+             return count
+         except Exception as e:
+             self.log(f"Error importing termbase: {e}")
+             return 0
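
For orientation, a minimal usage sketch of the TermbaseManager shown in the hunk above (not part of the packaged file). It assumes `db` is an already-constructed DatabaseManager exposing the `cursor` and `connection` attributes the class reads, and that the `glossaries`, `termbase_terms` and `termbase_project_activation` tables exist; the termbase name, term values and CSV path are illustrative only.

# Usage sketch (illustrative; `db` is assumed to be a DatabaseManager instance)
tb = TermbaseManager(db, log_callback=print)

# Create a global NL->EN termbase and add one term
termbase_id = tb.create_termbase("Legal NL-EN", source_lang="NL", target_lang="EN")
tb.add_term(termbase_id, "overeenkomst", "agreement", priority=10, domain="legal")

# Activate it for a project, list its entries, and export to CSV
tb.activate_for_project(termbase_id, project_id=1)
for entry in tb.get_termbase_terms(termbase_id):
    print(entry.source_term, "->", entry.target_term)
tb.export_glossary_to_csv(termbase_id, "legal_nl_en.csv")
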
@@ -0,0 +1,188 @@
+ """
+ ═══════════════════════════════════════════════════════════════════════════════
+ Image Extractor Module for Supervertaler
+ ═══════════════════════════════════════════════════════════════════════════════
+
+ Purpose:
+ Extract images from DOCX files and save them as sequentially numbered PNG files.
+ Integrated into the Reference Images tab under Translation Resources.
+
+ Features:
+ - Extract all images from DOCX documents
+ - Save as PNG files with sequential naming (Fig. 1.png, Fig. 2.png, etc.)
+ - Support for various image formats embedded in DOCX
+ - Progress feedback during extraction
+ - Can be used as standalone tool or within Translation Resources workflow
+
+ Author: Supervertaler Development Team
+ Created: 2025-11-17
+ Last Modified: 2025-11-17
+
+ ═══════════════════════════════════════════════════════════════════════════════
+ """
+
+ import os
+ from pathlib import Path
+ from typing import List, Tuple, Optional
+ from zipfile import ZipFile
+ from io import BytesIO
+ from PIL import Image
+
+
+ class ImageExtractor:
+     """Extract images from DOCX files and save as PNG"""
+
+     def __init__(self):
+         self.supported_formats = ['.docx']
+
+     def extract_images_from_docx(self, docx_path: str, output_dir: str,
+                                  prefix: str = "Fig.") -> Tuple[int, List[str]]:
+         """
+         Extract all images from a DOCX file and save as PNG files.
+
+         Args:
+             docx_path: Path to the DOCX file
+             output_dir: Directory where images will be saved
+             prefix: Prefix for output filenames (default: "Fig.")
+
+         Returns:
+             Tuple of (number of images extracted, list of output file paths)
+         """
+         # Validate input
+         if not os.path.exists(docx_path):
+             raise FileNotFoundError(f"DOCX file not found: {docx_path}")
+
+         if not docx_path.lower().endswith('.docx'):
+             raise ValueError("File must be a DOCX document")
+
+         # Create output directory if it doesn't exist
+         os.makedirs(output_dir, exist_ok=True)
+
+         extracted_files = []
+         image_count = 0
+
+         try:
+             # DOCX files are ZIP archives
+             with ZipFile(docx_path, 'r') as zip_ref:
+                 # Images are typically in word/media/ folder
+                 image_files = [f for f in zip_ref.namelist()
+                                if f.startswith('word/media/')]
+
+                 for img_file in image_files:
+                     image_count += 1
+
+                     # Read image data
+                     img_data = zip_ref.read(img_file)
+
+                     # Open with PIL to convert to PNG
+                     try:
+                         img = Image.open(BytesIO(img_data))
+
+                         # Convert RGBA to RGB if necessary (for JPEG compatibility)
+                         if img.mode in ('RGBA', 'LA', 'P'):
+                             # Create white background
+                             background = Image.new('RGB', img.size, (255, 255, 255))
+                             if img.mode == 'P':
+                                 img = img.convert('RGBA')
+                             background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
+                             img = background
+                         elif img.mode != 'RGB':
+                             img = img.convert('RGB')
+
+                         # Generate output filename
+                         output_filename = f"{prefix} {image_count}.png"
+                         output_path = os.path.join(output_dir, output_filename)
+
+                         # Save as PNG
+                         img.save(output_path, 'PNG', optimize=True)
+                         extracted_files.append(output_path)
+
+                     except Exception as e:
+                         print(f"Warning: Could not process image {img_file}: {e}")
+                         continue
+
+         except Exception as e:
+             raise Exception(f"Error extracting images: {e}")
+
+         return image_count, extracted_files
+
+     def extract_from_multiple_docx(self, docx_paths: List[str], output_dir: str,
+                                    prefix: str = "Fig.") -> Tuple[int, List[str]]:
+         """
+         Extract images from multiple DOCX files.
+
+         Args:
+             docx_paths: List of paths to DOCX files
+             output_dir: Directory where images will be saved
+             prefix: Prefix for output filenames (default: "Fig.")
+
+         Returns:
+             Tuple of (total number of images extracted, list of output file paths)
+         """
+         all_extracted_files = []
+         total_count = 0
+         current_number = 1
+
+         # Create output directory if it doesn't exist
+         os.makedirs(output_dir, exist_ok=True)
+
+         for docx_path in docx_paths:
+             try:
+                 # Extract images with sequential numbering across all files
+                 with ZipFile(docx_path, 'r') as zip_ref:
+                     image_files = [f for f in zip_ref.namelist()
+                                    if f.startswith('word/media/')]
+
+                     for img_file in image_files:
+                         img_data = zip_ref.read(img_file)
+
+                         try:
+                             img = Image.open(BytesIO(img_data))
+
+                             # Convert to RGB
+                             if img.mode in ('RGBA', 'LA', 'P'):
+                                 background = Image.new('RGB', img.size, (255, 255, 255))
+                                 if img.mode == 'P':
+                                     img = img.convert('RGBA')
+                                 background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
+                                 img = background
+                             elif img.mode != 'RGB':
+                                 img = img.convert('RGB')
+
+                             # Generate output filename with sequential numbering
+                             output_filename = f"{prefix} {current_number}.png"
+                             output_path = os.path.join(output_dir, output_filename)
+
+                             # Save as PNG
+                             img.save(output_path, 'PNG', optimize=True)
+                             all_extracted_files.append(output_path)
+
+                             current_number += 1
+                             total_count += 1
+
+                         except Exception as e:
+                             print(f"Warning: Could not process image from {docx_path}: {e}")
+                             continue
+
+             except Exception as e:
+                 print(f"Warning: Could not process file {docx_path}: {e}")
+                 continue
+
+         return total_count, all_extracted_files
+
+
+ # Standalone usage example
+ if __name__ == "__main__":
+     extractor = ImageExtractor()
+
+     # Example usage
+     docx_file = "example.docx"
+     output_directory = "extracted_images"
+
+     if os.path.exists(docx_file):
+         count, files = extractor.extract_images_from_docx(docx_file, output_directory)
+         print(f"Extracted {count} images:")
+         for f in files:
+             print(f" - {f}")
+     else:
+         print(f"File not found: {docx_file}")