supervertaler 1.9.153__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of supervertaler might be problematic. Click here for more details.

Files changed (85) hide show
  1. Supervertaler.py +47886 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1878 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +333 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1172 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.153.dist-info/METADATA +896 -0
  81. supervertaler-1.9.153.dist-info/RECORD +85 -0
  82. supervertaler-1.9.153.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.153.dist-info/top_level.txt +2 -0
@@ -0,0 +1,909 @@
1
+ """
2
+ PDF Rescue Module
3
+ Embeddable version of the AI-powered OCR tool for extracting text from poorly formatted PDFs
4
+ Uses OpenAI's GPT-4 Vision API
5
+
6
+ This module can be embedded in the main Supervertaler application as a tab.
7
+ """
8
+
9
+ import os
10
+ import base64
11
+ from pathlib import Path
12
+ import tkinter as tk
13
+ from tkinter import filedialog, messagebox, scrolledtext, ttk
14
+ from openai import OpenAI
15
+ from docx import Document
16
+ from docx.shared import Pt
17
+ import fitz # PyMuPDF
18
+ import re
19
+
20
+
21
+ class PDFRescue:
22
+ """
23
+ PDF Rescue feature - extract text from images using AI OCR
24
+ Can be embedded in any tkinter application as a tab or panel
25
+ """
26
+
27
+ def __init__(self, parent_app):
28
+ """
29
+ Initialize PDF Rescue module
30
+
31
+ Args:
32
+ parent_app: Reference to the main application (needs .api_keys attribute)
33
+ """
34
+ self.parent_app = parent_app
35
+ self.client = None
36
+ self.image_files = []
37
+ self.extracted_texts = {}
38
+
39
+ # Initialize OpenAI client
40
+ api_key = None
41
+ if hasattr(parent_app, 'api_keys'):
42
+ api_key = parent_app.api_keys.get('openai')
43
+ elif hasattr(parent_app, 'api_key'):
44
+ api_key = parent_app.api_key
45
+
46
+ if api_key:
47
+ try:
48
+ self.client = OpenAI(api_key=api_key)
49
+ except Exception as e:
50
+ print(f"Failed to initialize OpenAI client: {e}")
51
+
52
+ def log_message(self, message: str):
53
+ """Log a message to the parent app's log if available"""
54
+ if hasattr(self.parent_app, 'log'):
55
+ self.parent_app.log(f"[PDF Rescue] {message}")
56
+ else:
57
+ print(f"[PDF Rescue] {message}")
58
+
59
def create_tab(self, parent):
    """
    Create the PDF Rescue tab UI

    Builds, top to bottom inside ``parent``: a header strip, a horizontal
    paned view (file list on the left, extracted-text preview on the
    right), a "Processing Options" frame (model, formatting checkbox,
    editable instructions), a row of action buttons, a status label and a
    progress bar.

    Widgets and variables that other methods read are stored on ``self``:
    ``file_listbox``, ``preview_text``, ``model_var``,
    ``preserve_formatting_var``, ``instructions_text``, ``status_label``
    and ``progress``.

    Args:
        parent: The parent widget (notebook tab or frame)
    """
    # Save current state before recreating UI
    saved_files = self.image_files.copy() if hasattr(self, 'image_files') else []
    saved_texts = self.extracted_texts.copy() if hasattr(self, 'extracted_texts') else {}

    # Header
    header_frame = tk.Frame(parent, bg='#e3f2fd', relief='solid', borderwidth=1)
    header_frame.pack(fill='x', padx=5, pady=5)

    tk.Label(header_frame, text="🔍 PDF Rescue - AI-Powered OCR",
             font=('Segoe UI', 10, 'bold'), bg='#e3f2fd').pack(side='left', padx=10, pady=5)

    tk.Label(header_frame, text="Extract text from poorly formatted PDF screenshots",
             font=('Segoe UI', 9), bg='#e3f2fd', fg='#666').pack(side='left', padx=(0, 10), pady=5)

    # Split view: Files on left, Preview on right
    paned = ttk.PanedWindow(parent, orient='horizontal')
    paned.pack(fill='both', expand=True, padx=5, pady=5)

    # LEFT: File list
    left_frame = tk.Frame(paned)
    paned.add(left_frame, weight=1)

    tk.Label(left_frame, text="Images to Process",
             font=('Segoe UI', 9, 'bold')).pack(anchor='w', pady=(0, 5))

    # File list with scrollbar
    list_container = tk.Frame(left_frame)
    list_container.pack(fill='both', expand=True)

    scroll = tk.Scrollbar(list_container, orient='vertical')
    scroll.pack(side='right', fill='y')

    self.file_listbox = tk.Listbox(list_container, yscrollcommand=scroll.set,
                                   font=('Consolas', 9))
    self.file_listbox.pack(fill='both', expand=True)
    scroll.config(command=self.file_listbox.yview)
    # Selecting a row shows that image's extracted text in the preview pane.
    self.file_listbox.bind('<<ListboxSelect>>', self._on_file_select)

    # Buttons
    btn_frame = tk.Frame(left_frame)
    btn_frame.pack(fill='x', pady=(10, 0))

    tk.Button(btn_frame, text="📄 PDF", command=self._import_from_pdf,
              bg='#9C27B0', fg='white', font=('Segoe UI', 8, 'bold'),
              padx=8, pady=4).pack(side='left', padx=(0, 3))

    tk.Button(btn_frame, text="➕ Add Files", command=self._add_files,
              bg='#2196F3', fg='white', font=('Segoe UI', 8, 'bold'),
              padx=8, pady=4).pack(side='left', padx=3)

    tk.Button(btn_frame, text="📂 Folder", command=self._add_folder,
              bg='#2196F3', fg='white', font=('Segoe UI', 8, 'bold'),
              padx=8, pady=4).pack(side='left', padx=3)

    tk.Button(btn_frame, text="Clear", command=self._clear_list,
              bg='#9E9E9E', fg='white', font=('Segoe UI', 8),
              padx=8, pady=4).pack(side='left', padx=3)

    # RIGHT: Text preview
    right_frame = tk.Frame(paned)
    paned.add(right_frame, weight=2)

    tk.Label(right_frame, text="Extracted Text Preview",
             font=('Segoe UI', 9, 'bold')).pack(anchor='w', pady=(0, 5))

    self.preview_text = scrolledtext.ScrolledText(right_frame, wrap='word',
                                                  font=('Segoe UI', 9),
                                                  height=15)
    self.preview_text.pack(fill='both', expand=True)

    # Processing options
    options_frame = tk.LabelFrame(parent, text="Processing Options",
                                  padx=10, pady=10)
    options_frame.pack(fill='x', padx=5, pady=(0, 10))

    # Model selection and formatting option
    model_frame = tk.Frame(options_frame)
    model_frame.pack(fill='x', pady=(0, 5))

    tk.Label(model_frame, text="Model:", font=('Segoe UI', 9)).pack(side='left', padx=(0, 5))
    # The selected value is passed as ``model`` when an image is sent for OCR.
    self.model_var = tk.StringVar(value="gpt-4o")
    models = ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo"]
    ttk.Combobox(model_frame, textvariable=self.model_var, values=models,
                 width=20, state='readonly').pack(side='left')

    # Formatting option
    # Controls whether the markdown-formatting rule is added to or removed
    # from the prompt, and whether the DOCX export parses markdown markers.
    self.preserve_formatting_var = tk.BooleanVar(value=True)
    tk.Checkbutton(model_frame, text="Preserve formatting (bold/italic/underline)",
                   variable=self.preserve_formatting_var,
                   font=('Segoe UI', 9)).pack(side='left', padx=(20, 0))

    # Custom instructions
    instructions_header = tk.Frame(options_frame)
    instructions_header.pack(fill='x', pady=(5, 2))

    tk.Label(instructions_header, text="Extraction Instructions:",
             font=('Segoe UI', 9)).pack(side='left')

    tk.Button(instructions_header, text="👁️ Show Prompt",
              command=self._show_full_prompt,
              bg='#9C27B0', fg='white', font=('Segoe UI', 8),
              padx=8, pady=2).pack(side='right')

    self.instructions_text = scrolledtext.ScrolledText(options_frame, wrap='word',
                                                       font=('Segoe UI', 9),
                                                       height=3)
    self.instructions_text.pack(fill='x')

    # Default prompt; the user can edit it in the text box before processing.
    default_instructions = """Extract all text from this image. The image is a screenshot from a poorly formatted PDF.
Please:
- Extract all visible text accurately
- Fix any obvious OCR errors or formatting issues
- Remove extraneous line breaks within paragraphs
- Preserve intentional paragraph breaks
- Maintain the logical flow and structure of the content
- For redacted/blacked-out text: insert a descriptive placeholder in square brackets in the document's language (e.g., [naam] for Dutch names, [name] for English names, [bedrag] for amounts, etc.)
- For stamps, signatures, or images: insert a descriptive placeholder in square brackets in the document's language (e.g., [handtekening], [stempel], [signature], [stamp], etc.)
- For any non-text elements that would normally appear: describe them briefly in square brackets
- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__
- Output clean, readable text only (no commentary)"""

    self.instructions_text.insert('1.0', default_instructions)

    # Action buttons
    action_frame = tk.Frame(parent, bg='white')
    action_frame.pack(fill='x', padx=5, pady=(0, 10))

    tk.Button(action_frame, text="🔍 Process Selected",
              command=self._process_selected,
              bg='#FF9800', fg='white', font=('Segoe UI', 9, 'bold'),
              padx=15, pady=6).pack(side='left', padx=(0, 5))

    tk.Button(action_frame, text="⚡ Process ALL",
              command=self._process_all,
              bg='#4CAF50', fg='white', font=('Segoe UI', 9, 'bold'),
              padx=15, pady=6).pack(side='left', padx=5)

    tk.Button(action_frame, text="💾 Save DOCX",
              command=self._save_to_docx,
              bg='#2196F3', fg='white', font=('Segoe UI', 9, 'bold'),
              padx=15, pady=6).pack(side='left', padx=5)

    tk.Button(action_frame, text="📋 Copy All",
              command=self._copy_all_text,
              bg='#607D8B', fg='white', font=('Segoe UI', 9, 'bold'),
              padx=15, pady=6).pack(side='left', padx=5)

    tk.Button(action_frame, text="📊 Session Report",
              command=self._save_session_report,
              bg='#795548', fg='white', font=('Segoe UI', 9, 'bold'),
              padx=15, pady=6).pack(side='left', padx=5)

    # Status
    self.status_label = tk.Label(parent, text="Ready - Add images to begin",
                                 font=('Segoe UI', 9), fg='#666', bg='white')
    self.status_label.pack(pady=(0, 5))

    # Progress bar
    self.progress = ttk.Progressbar(parent, mode='determinate')
    self.progress.pack(fill='x', padx=5, pady=(0, 5))

    # Restore state after UI recreation
    self.image_files = saved_files
    self.extracted_texts = saved_texts
    if self.image_files:
        # Repopulate the new listbox with the files carried over.
        self._update_listbox()
232
+
233
+ # === File Management Methods ===
234
+
235
def _import_from_pdf(self):
    """
    Render every page of a user-selected PDF to PNG and queue the images.

    Pages are rendered at 2x zoom and written to a ``<pdf name>_images``
    folder created next to the PDF. Each new image path is appended to
    ``self.image_files``; paths already in the list are not added twice.
    The PDF handle is always closed, including on the empty-PDF path and
    when rendering fails. Errors are reported in a message box rather
    than raised.
    """
    pdf_file = filedialog.askopenfilename(
        title="Select PDF File",
        filetypes=[
            ("PDF files", "*.pdf"),
            ("All files", "*.*")
        ]
    )

    if not pdf_file:
        return

    def refresh_ui():
        # The host application may not expose a Tk root (the processing
        # methods guard the same way), so only refresh when it does.
        if hasattr(self.parent_app, 'root'):
            self.parent_app.root.update_idletasks()

    try:
        pdf_path = Path(pdf_file)
        pdf_name = pdf_path.stem
        extracted_count = 0
        temp_dir = ""

        doc = fitz.open(pdf_file)
        try:
            total_pages = len(doc)

            if total_pages:
                # Create folder for extracted images next to the PDF
                images_folder = pdf_path.parent / f"{pdf_name}_images"
                images_folder.mkdir(exist_ok=True)
                temp_dir = str(images_folder)

                self.log_message(f"Starting PDF import: {pdf_path.name}")
                self.log_message(f"Total pages: {total_pages}")

                self.status_label.config(text="Extracting pages from PDF...")
                refresh_ui()

                # Render at 2x resolution for better OCR quality
                mat = fitz.Matrix(2.0, 2.0)

                for page_num in range(total_pages):
                    pix = doc[page_num].get_pixmap(matrix=mat)

                    # Save as PNG
                    img_filename = f"{pdf_name}_page_{page_num + 1:03d}.png"
                    img_path = os.path.join(temp_dir, img_filename)
                    pix.save(img_path)

                    # Add to image list (re-importing the same PDF must
                    # not create duplicate entries)
                    if img_path not in self.image_files:
                        self.image_files.append(img_path)
                        extracted_count += 1

                    self.log_message(
                        f" Page {page_num + 1}/{total_pages} extracted: {img_filename}"
                    )

                    self.status_label.config(
                        text=f"Extracting page {page_num + 1}/{total_pages}..."
                    )
                    refresh_ui()
        finally:
            # Release the file handle on success, early exit and failure.
            doc.close()

        if not total_pages:
            messagebox.showwarning("Empty PDF", "The selected PDF has no pages.")
            return

        # Update UI
        self._update_listbox()
        self.status_label.config(
            text=f"Imported {extracted_count} page(s) from PDF"
        )

        self.log_message(f"PDF import complete: {extracted_count} pages extracted")
        self.log_message(f"Temporary folder: {temp_dir}")

        messagebox.showinfo(
            "PDF Import Complete",
            f"Successfully extracted {extracted_count} page(s) from:\n{pdf_path.name}\n\n"
            f"Images saved to folder:\n{temp_dir}\n\n"
            f"These images are kept for your reference and can be useful for the end client.\n\n"
            f"You can now process these pages with AI OCR."
        )

    except Exception as e:
        messagebox.showerror("PDF Import Error", f"Failed to import PDF:\n{str(e)}")
        self.status_label.config(text="PDF import failed")
328
+
329
def _add_files(self):
    """
    Let the user pick image files and append the new ones to the list.

    Files already present in ``self.image_files`` are skipped. The status
    line reports how many files were actually added, not how many were
    selected in the dialog.
    """
    files = filedialog.askopenfilenames(
        title="Select Image Files",
        filetypes=[
            ("Image files", "*.jpg *.jpeg *.png *.bmp *.gif *.tiff"),
            ("All files", "*.*")
        ]
    )

    if not files:
        return

    added = 0
    for file in files:
        if file not in self.image_files:
            self.image_files.append(file)
            added += 1

    self._update_listbox()
    self.status_label.config(text=f"Added {added} file(s)")
345
+
346
def _add_folder(self):
    """Queue every not-yet-listed image file found directly inside a chosen folder."""
    folder = filedialog.askdirectory(title="Select Folder with Images")
    if not folder:
        return

    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}
    new_paths = []

    # Sorted directory order determines the order of the queued files.
    for entry in sorted(os.listdir(folder)):
        candidate = os.path.join(folder, entry)
        if not os.path.isfile(candidate):
            continue
        suffix = os.path.splitext(entry)[1].lower()
        if suffix not in image_extensions:
            continue
        if candidate in self.image_files:
            continue
        new_paths.append(candidate)

    self.image_files.extend(new_paths)
    self._update_listbox()
    self.status_label.config(text=f"Added {len(new_paths)} file(s) from folder")
364
+
365
def _clear_list(self):
    """After confirmation, drop all queued files and their extracted text."""
    # Nothing queued: no confirmation dialog, nothing to do.
    if not self.image_files:
        return
    if not messagebox.askyesno("Clear", "Remove all files?"):
        return

    self.image_files = []
    self.extracted_texts = {}
    self._update_listbox()
    self.preview_text.delete('1.0', tk.END)
    self.status_label.config(text="List cleared")
373
+
374
+ def _update_listbox(self):
375
+ """Update file listbox"""
376
+ self.file_listbox.delete(0, tk.END)
377
+ for i, file in enumerate(self.image_files, 1):
378
+ filename = os.path.basename(file)
379
+ status = "✓ " if file in self.extracted_texts else ""
380
+ self.file_listbox.insert(tk.END, f"{status}{i:2d}. {filename}")
381
+
382
+ def _on_file_select(self, event):
383
+ """Show extracted text when file is selected"""
384
+ selection = self.file_listbox.curselection()
385
+ if not selection:
386
+ return
387
+
388
+ idx = selection[0]
389
+ if idx < len(self.image_files):
390
+ file = self.image_files[idx]
391
+ if file in self.extracted_texts:
392
+ self.preview_text.delete('1.0', tk.END)
393
+ self.preview_text.insert('1.0', self.extracted_texts[file])
394
+
395
def _show_full_prompt(self):
    """Open a read-only popup showing the prompt text and settings used for OCR."""
    markdown_rule = "- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__"
    keep_formatting = self.preserve_formatting_var.get()

    prompt = self.instructions_text.get('1.0', 'end-1c').strip()

    # Same adjustment as _extract_text_from_image: add the markdown rule when
    # formatting is on, strip it when off.
    if keep_formatting:
        if "markdown for text formatting" not in prompt:
            prompt += "\n" + markdown_rule
    else:
        prompt = prompt.replace("\n" + markdown_rule, "")
        prompt = prompt.replace(markdown_rule, "")

    # Popup window
    window = tk.Toplevel()
    window.title("Full Prompt Preview")
    window.geometry("700x600")

    body = tk.Frame(window, padx=15, pady=15)
    body.pack(fill='both', expand=True)

    # Title
    heading = tk.Label(body, text="Exact Prompt Sent to OpenAI API",
                       font=('Segoe UI', 12, 'bold'))
    heading.pack(pady=(0, 10))

    # Configuration summary
    config_box = tk.LabelFrame(body, text="Configuration", padx=10, pady=10)
    config_box.pack(fill='x', pady=(0, 10))

    tk.Label(config_box, text=f"Model: {self.model_var.get()}",
             font=('Segoe UI', 9, 'bold')).pack(anchor='w')

    if self.preserve_formatting_var.get():
        formatting_status = "✓ Enabled"
    else:
        formatting_status = "✗ Disabled"
    tk.Label(config_box, text=f"Formatting Preservation: {formatting_status}",
             font=('Segoe UI', 9)).pack(anchor='w')

    tk.Label(config_box, text="Max Tokens: 4000",
             font=('Segoe UI', 9)).pack(anchor='w')

    # Prompt text (disabled after insert so it cannot be edited here)
    prompt_box = tk.LabelFrame(body, text="Full Instructions Text",
                               padx=10, pady=10)
    prompt_box.pack(fill='both', expand=True, pady=(0, 10))

    prompt_view = scrolledtext.ScrolledText(prompt_box, wrap='word',
                                            font=('Consolas', 9))
    prompt_view.pack(fill='both', expand=True)
    prompt_view.insert('1.0', prompt)
    prompt_view.config(state='disabled')

    # Note
    footnote = tk.Label(
        body,
        text="Note: The image is sent as base64-encoded data along with these instructions.",
        font=('Segoe UI', 8), fg='#666')
    footnote.pack(pady=(0, 5))

    # Close button
    close_button = tk.Button(body, text="Close", command=window.destroy,
                             bg='#607D8B', fg='white', font=('Segoe UI', 9, 'bold'),
                             padx=20, pady=6)
    close_button.pack()
459
+
460
def _save_session_report(self):
    """
    Generate and save a session report in markdown format.

    The report lists the current configuration, the prompt as it is sent
    to the model (markdown rule added or removed according to the
    formatting checkbox, matching ``_extract_text_from_image``), a
    per-file status table, the extracted text per processed image and
    simple character/word statistics.

    After a successful save the user may open the file. Opening is done
    separately from saving, so a failure to launch a viewer is never
    reported as a failure to save.
    """
    if not self.extracted_texts:
        messagebox.showwarning("No Data", "No OCR processing has been performed yet.\n\n"
                               "Process some images first to generate a session report.")
        return

    # Ask for save location
    output_file = filedialog.asksaveasfilename(
        title="Save Session Report",
        defaultextension=".md",
        filetypes=[("Markdown files", "*.md"), ("Text files", "*.txt"), ("All files", "*.*")],
        initialfile="PDF_Rescue_SessionReport.md"
    )

    if not output_file:
        return

    try:
        from datetime import datetime

        keep_formatting = self.preserve_formatting_var.get()
        markdown_rule = "- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__"

        # Generate report content
        report_lines = []
        report_lines.append("# PDF Rescue - Session Report\n")
        report_lines.append("**Generated by [Supervertaler](https://supervertaler.com/) • by Michael Beijer**\n\n")
        report_lines.append(f"**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        report_lines.append("---\n\n")

        # Configuration section
        report_lines.append("## Configuration\n\n")
        report_lines.append(f"- **Model**: {self.model_var.get()}\n")
        formatting_status = "Enabled ✓" if keep_formatting else "Disabled ✗"
        report_lines.append(f"- **Formatting Preservation**: {formatting_status}\n")
        report_lines.append(f"- **Total Images Processed**: {len(self.extracted_texts)}\n")
        report_lines.append(f"- **Total Images in List**: {len(self.image_files)}\n\n")

        # Instructions used: mirror the prompt actually sent to the model,
        # including removal of the markdown rule when formatting is off.
        report_lines.append("## Extraction Instructions\n\n")
        report_lines.append("```\n")
        instructions = self.instructions_text.get('1.0', 'end-1c').strip()
        if keep_formatting:
            if "markdown for text formatting" not in instructions:
                instructions += "\n" + markdown_rule
        else:
            instructions = instructions.replace("\n" + markdown_rule, "").replace(markdown_rule, "")
        report_lines.append(instructions)
        report_lines.append("\n```\n\n")

        # Processing summary
        report_lines.append("## Processing Summary\n\n")
        report_lines.append("| # | Image File | Status |\n")
        report_lines.append("|---|------------|--------|\n")

        for i, file in enumerate(self.image_files, 1):
            filename = os.path.basename(file)
            status = "✓ Processed" if file in self.extracted_texts else "⧗ Pending"
            report_lines.append(f"| {i} | {filename} | {status} |\n")

        report_lines.append("\n---\n\n")

        # Extracted text for each image
        report_lines.append("## Extracted Text\n\n")

        for i, file in enumerate(self.image_files, 1):
            if file in self.extracted_texts:
                filename = os.path.basename(file)
                report_lines.append(f"### Page {i}: {filename}\n\n")
                report_lines.append("```\n")
                report_lines.append(self.extracted_texts[file])
                report_lines.append("\n```\n\n")
                report_lines.append("---\n\n")

        # Statistics (extracted_texts is non-empty here, see guard above)
        report_lines.append("## Statistics\n\n")
        page_count = len(self.extracted_texts)
        total_chars = sum(len(text) for text in self.extracted_texts.values())
        total_words = sum(len(text.split()) for text in self.extracted_texts.values())
        report_lines.append(f"- **Total Characters Extracted**: {total_chars:,}\n")
        report_lines.append(f"- **Total Words Extracted**: {total_words:,}\n")
        report_lines.append(f"- **Average Characters per Page**: {total_chars // page_count:,}\n")
        report_lines.append(f"- **Average Words per Page**: {total_words // page_count:,}\n\n")

        # Footer
        report_lines.append("---\n\n")
        report_lines.append("*Report generated by **PDF Rescue** - AI-Powered OCR Tool*\n\n")
        report_lines.append("*Part of [**Supervertaler**](https://supervertaler.com/) • by Michael Beijer*\n")

        # Write to file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(report_lines)

    except Exception as e:
        messagebox.showerror("Error", f"Failed to save report:\n\n{str(e)}")
        return

    self.log_message(f"Session report saved: {Path(output_file).name}")
    self.status_label.config(text=f"✓ Report saved to {os.path.basename(output_file)}")

    if not messagebox.askyesno("Success",
                               f"Session report saved successfully!\n\n"
                               f"File: {Path(output_file).name}\n\n"
                               "Open the report now?"):
        return

    # os.startfile only exists on Windows; use the platform opener elsewhere.
    import subprocess
    import sys

    try:
        if hasattr(os, 'startfile'):
            os.startfile(output_file)
        elif sys.platform == 'darwin':
            subprocess.run(['open', output_file], check=False)
        else:
            subprocess.run(['xdg-open', output_file], check=False)
    except OSError as e:
        messagebox.showwarning(
            "Open Report",
            f"The report was saved, but it could not be opened automatically:\n\n{str(e)}"
        )
559
+
560
+ # === OCR Processing Methods ===
561
+
562
+ def _encode_image(self, image_path):
563
+ """Encode image to base64"""
564
+ with open(image_path, "rb") as image_file:
565
+ return base64.b64encode(image_file.read()).decode('utf-8')
566
+
567
+ def _extract_text_from_image(self, image_path):
568
+ """Use GPT-4 Vision to extract text from image"""
569
+ if not self.client:
570
+ return "[ERROR: OpenAI client not initialized. Check API key.]"
571
+
572
+ try:
573
+ base64_image = self._encode_image(image_path)
574
+ instructions = self.instructions_text.get('1.0', 'end-1c').strip()
575
+
576
+ # Add or remove formatting instruction based on checkbox
577
+ if self.preserve_formatting_var.get():
578
+ if "markdown for text formatting" not in instructions:
579
+ instructions += "\n- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__"
580
+ else:
581
+ # Remove markdown instruction if present
582
+ instructions = instructions.replace(
583
+ "\n- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__", ""
584
+ ).replace(
585
+ "- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__", ""
586
+ )
587
+
588
+ response = self.client.chat.completions.create(
589
+ model=self.model_var.get(),
590
+ messages=[
591
+ {
592
+ "role": "user",
593
+ "content": [
594
+ {
595
+ "type": "text",
596
+ "text": instructions
597
+ },
598
+ {
599
+ "type": "image_url",
600
+ "image_url": {
601
+ "url": f"data:image/jpeg;base64,{base64_image}"
602
+ }
603
+ }
604
+ ]
605
+ }
606
+ ],
607
+ max_tokens=4000
608
+ )
609
+
610
+ return response.choices[0].message.content
611
+
612
+ except Exception as e:
613
+ return f"[ERROR extracting text: {str(e)}]"
614
+
615
+ def _process_selected(self):
616
+ """Process currently selected image"""
617
+ selection = self.file_listbox.curselection()
618
+ if not selection:
619
+ messagebox.showwarning("No Selection", "Please select an image to process")
620
+ return
621
+
622
+ idx = selection[0]
623
+ if idx >= len(self.image_files):
624
+ return
625
+
626
+ file = self.image_files[idx]
627
+ filename = os.path.basename(file)
628
+
629
+ self.log_message(f"Processing selected image: {filename}")
630
+ self.status_label.config(text=f"Processing {filename}...")
631
+ if hasattr(self.parent_app, 'root'):
632
+ self.parent_app.root.update()
633
+
634
+ text = self._extract_text_from_image(file)
635
+ self.extracted_texts[file] = text
636
+
637
+ self.preview_text.delete('1.0', tk.END)
638
+ self.preview_text.insert('1.0', text)
639
+
640
+ self._update_listbox()
641
+ self.log_message(f"Successfully processed: {filename}")
642
+ self.status_label.config(text=f"✓ Processed {filename}")
643
+
644
def _process_all(self):
    """
    Run OCR on every queued image that has no usable result yet.

    Images that already have extracted text are skipped. Images whose
    stored result is an ``[ERROR ...]`` placeholder (the string
    ``_extract_text_from_image`` returns on failure) are retried instead
    of being treated as done. The closing status, log line and dialog
    report the real numbers of processed, skipped and failed images.
    """
    if not self.image_files:
        messagebox.showwarning("No Files", "Please add images first")
        return

    total = len(self.image_files)

    if not messagebox.askyesno("Process All",
                               f"Process all {total} images?\n\n"
                               "This will use API credits and may take several minutes."):
        return

    def is_error(value):
        # The two error prefixes produced by _extract_text_from_image.
        return isinstance(value, str) and value.startswith(
            ("[ERROR:", "[ERROR extracting text:")
        )

    self.log_message(f"Starting batch processing: {total} images")
    self.progress['maximum'] = total
    self.progress['value'] = 0

    processed = skipped = failed = 0

    for i, file in enumerate(self.image_files, 1):
        filename = os.path.basename(file)
        self.status_label.config(text=f"Processing {i}/{total}: {filename}...")
        if hasattr(self.parent_app, 'root'):
            # Repaint so status and progress stay live during the blocking call.
            self.parent_app.root.update()

        needs_run = file not in self.extracted_texts or is_error(self.extracted_texts[file])
        if needs_run:
            text = self._extract_text_from_image(file)
            self.extracted_texts[file] = text
            if is_error(text):
                failed += 1
                self.log_message(f" [{i}/{total}] Failed: {filename}")
            else:
                processed += 1
                self.log_message(f" [{i}/{total}] Processed: {filename}")
        else:
            skipped += 1
            self.log_message(f" [{i}/{total}] Skipped (already processed): {filename}")

        self.progress['value'] = i
        self._update_listbox()

    summary = f"{processed} processed, {skipped} skipped, {failed} failed"
    self.log_message(f"Batch processing complete: {summary}")

    if failed:
        self.status_label.config(text=f"✗ Batch finished with errors: {summary}")
        messagebox.showwarning("Completed with errors",
                               f"Finished {total} images: {summary}.\n\n"
                               "Select a failed image to see its error message, "
                               "or run 'Process ALL' again to retry.")
    else:
        self.status_label.config(text=f"✓ Processed all {total} images!")
        messagebox.showinfo("Complete",
                            f"Successfully processed {total} images!\n\n"
                            "Click 'Save DOCX' to export the text.")
680
+
681
+ # === Export Methods ===
682
+
683
def _add_formatted_text(self, doc, text):
    """
    Add text to the document, converting inline markdown into run formatting.

    Supports: **bold**, *italic*, __underline__

    Each non-blank line of ``text`` becomes its own paragraph. Markers that
    do not enclose any content (a stray ``**``, ``*`` or ``__``) are kept as
    literal text instead of being dropped.

    Args:
        doc: Document object providing ``add_paragraph()``.
        text: Extracted text, possibly containing inline markdown markers.
    """
    # One alternation, tried left to right at every position. Bold is listed
    # before italic so that "**x**" is never read as two italic markers.
    # Every alternative requires at least one character of content, so empty
    # pairs such as "**" or "____" fall through as plain text.
    inline = re.compile(
        r'\*\*(?P<bold>.+?)\*\*'
        r'|__(?P<underline>.+?)__'
        r'|\*(?P<italic>[^*]+)\*'
    )

    for para_text in text.split('\n'):
        # Blank lines only separate paragraphs; they produce no output.
        if not para_text.strip():
            continue

        para = doc.add_paragraph()
        para.paragraph_format.line_spacing = 1.15
        para.paragraph_format.space_after = Pt(12)

        cursor = 0
        for match in inline.finditer(para_text):
            # Plain text between the previous match and this one.
            if match.start() > cursor:
                para.add_run(para_text[cursor:match.start()])

            # Exactly one named group takes part in a match; its name is
            # also the name of the run attribute to switch on.
            style = match.lastgroup
            run = para.add_run(match.group(style))
            setattr(run, style, True)
            cursor = match.end()

        # Trailing plain text, including any unmatched markers.
        if cursor < len(para_text):
            para.add_run(para_text[cursor:])
751
+
752
def _save_to_docx(self):
    """
    Save all extracted text to a Word document.

    Asks the user for a destination, writes one section per image that has
    extracted text (in ``self.image_files`` order, keeping the original page
    numbers), and finally offers to open the saved file.

    Failures while building or saving the document are reported in an error
    dialog. A failure to *open* the saved file is reported separately so it
    is not mistaken for a failed save.
    """
    if not self.extracted_texts:
        messagebox.showwarning("No Text", "No extracted text to save.\n\n"
                               "Process images first.")
        return

    output_file = filedialog.asksaveasfilename(
        title="Save Extracted Text",
        defaultextension=".docx",
        filetypes=[("Word Document", "*.docx"), ("All files", "*.*")],
        initialfile="extracted_text.docx"
    )

    if not output_file:
        return

    self.log_message(f"Saving extracted text to DOCX: {Path(output_file).name}")

    # Only images that actually have text are written. The 1-based index is
    # kept so the "Page N" headings still match the image's position.
    pages = [
        (number, file)
        for number, file in enumerate(self.image_files, 1)
        if file in self.extracted_texts
    ]

    try:
        doc = Document()

        # Add title
        title = doc.add_heading('Extracted Text from Images', 0)
        title.runs[0].font.size = Pt(16)

        # Add extracted text in order
        for position, (number, file) in enumerate(pages, 1):
            # Page header
            heading = doc.add_heading(f'Page {number}: {os.path.basename(file)}', level=2)
            heading.runs[0].font.size = Pt(12)

            # Text content with formatting
            text = self.extracted_texts[file]
            if self.preserve_formatting_var.get():
                self._add_formatted_text(doc, text)
            else:
                para = doc.add_paragraph(text)
                para.paragraph_format.line_spacing = 1.15
                para.paragraph_format.space_after = Pt(12)

            # Page break between written pages only. Comparing against the
            # written pages (not all images) avoids a trailing blank page
            # when the last image has no extracted text.
            if position < len(pages):
                doc.add_page_break()

        doc.save(output_file)

        self.log_message(f"Successfully saved {len(pages)} pages to: {Path(output_file).name}")
        self.status_label.config(text=f"✓ Saved to {os.path.basename(output_file)}")

    except Exception as e:
        messagebox.showerror("Error", f"Failed to save document:\n\n{str(e)}")
        return

    if not messagebox.askyesno("Success",
                               f"Document saved successfully!\n\n"
                               f"{len(pages)} pages of text extracted\n\n"
                               "Open the document now?"):
        return

    # os.startfile exists only on Windows; use the platform's opener
    # elsewhere. Imported locally because only this branch needs them.
    import subprocess
    import sys

    try:
        if hasattr(os, 'startfile'):
            os.startfile(output_file)
        elif sys.platform == 'darwin':
            subprocess.Popen(['open', output_file])
        else:
            subprocess.Popen(['xdg-open', output_file])
    except OSError as e:
        messagebox.showwarning(
            "Could Not Open",
            f"The document was saved, but could not be opened automatically:\n\n{e}"
        )
811
+
812
def _copy_all_text(self):
    """
    Copy all extracted text to the clipboard.

    Pages are emitted in ``self.image_files`` order, each preceded by a
    ``=== Page N: <file name> ===`` header. Images without extracted text
    are skipped but still count towards the page number.
    """
    if not self.extracted_texts:
        messagebox.showwarning("No Text", "No extracted text to copy")
        return

    sections = [
        f"=== Page {number}: {os.path.basename(file)} ===\n"
        f"{self.extracted_texts[file]}\n\n"
        for number, file in enumerate(self.image_files, 1)
        if file in self.extracted_texts
    ]
    combined = "".join(sections)

    # Prefer the parent application's root window. Without one, fall back to
    # this module's own status label so the text is really copied before
    # success is reported.
    # NOTE(review): assumes status_label is a tkinter widget and therefore
    # offers clipboard_clear()/clipboard_append() - confirm.
    if hasattr(self.parent_app, 'root'):
        clipboard_owner = self.parent_app.root
    else:
        clipboard_owner = self.status_label

    clipboard_owner.clipboard_clear()
    clipboard_owner.clipboard_append(combined)

    # Report the number of pages actually placed on the clipboard.
    self.status_label.config(text=f"✓ Copied {len(sections)} pages to clipboard")
    messagebox.showinfo("Copied", f"Copied text from {len(sections)} pages to clipboard!")
834
+
835
+
836
+ # === Standalone Application ===
837
+
838
if __name__ == "__main__":
    # Run PDF Rescue as a standalone application.

    class StandaloneApp:
        """
        Minimal parent app for standalone mode.

        Creates the Tk root window, reads the OpenAI API key from
        ``api_keys.txt`` in the current working directory, embeds a
        ``PDFRescue`` tab and adds an activity log underneath it.

        ``ready`` is True only when start-up completed. When the API key is
        missing the root window is destroyed and ``ready`` stays False.
        """

        def __init__(self):
            # Set first so callers can always test it, even after the early
            # return below.
            self.ready = False

            self.root = tk.Tk()
            self.root.title("PDF Rescue - AI-Powered OCR Tool")
            self.root.geometry("1000x700")

            # Load API key from api_keys.txt ("NAME=value" lines; blank
            # lines and lines starting with '#' are ignored).
            self.api_keys = {}
            api_file = Path("api_keys.txt")
            if api_file.exists():
                with open(api_file, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line and not line.startswith('#') and '=' in line:
                            key, value = line.split('=', 1)
                            if 'openai' in key.lower():
                                self.api_keys['openai'] = value.strip()

            if not self.api_keys.get('openai'):
                messagebox.showerror(
                    "API Key Missing",
                    "Could not find OpenAI API key in api_keys.txt\n\n"
                    "Please add a line like:\nOPENAI_API_KEY=your-key-here"
                )
                self.root.destroy()
                return

            # Create main container
            main_frame = tk.Frame(self.root)
            main_frame.pack(fill='both', expand=True, padx=10, pady=10)

            # Add title
            title = tk.Label(main_frame, text="PDF Rescue - AI-Powered OCR Tool",
                             font=('Segoe UI', 14, 'bold'))
            title.pack(pady=(0, 10))

            # Create PDF Rescue instance
            self.pdf_rescue = PDFRescue(self)
            self.pdf_rescue.create_tab(main_frame)

            # Add log at bottom
            log_frame = tk.LabelFrame(self.root, text="Activity Log", padx=5, pady=5)
            log_frame.pack(fill='x', padx=10, pady=(0, 10))

            self.log_text = scrolledtext.ScrolledText(log_frame, height=4, wrap='word',
                                                      font=('Consolas', 9))
            self.log_text.pack(fill='both', expand=True)
            # Read-only for the user; log() re-enables it briefly to write.
            self.log_text.config(state='disabled')

            self.ready = True

        def log(self, message: str):
            """Append a timestamped message to the activity log."""
            from datetime import datetime
            timestamp = datetime.now().strftime("%H:%M:%S")
            formatted_message = f"[{timestamp}] {message}\n"

            self.log_text.config(state='normal')
            self.log_text.insert('end', formatted_message)
            self.log_text.see('end')
            self.log_text.config(state='disabled')

        def run(self):
            """Start the application"""
            self.root.mainloop()

    # Create and run standalone app. The root window is not queried here:
    # after destroy(), winfo_exists() raises TclError rather than returning
    # False, so the explicit flag is used instead.
    app = StandaloneApp()
    if app.ready:
        app.run()