supervertaler 1.9.153__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic. Click here for more details.
- Supervertaler.py +47886 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1878 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +333 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1172 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.153.dist-info/METADATA +896 -0
- supervertaler-1.9.153.dist-info/RECORD +85 -0
- supervertaler-1.9.153.dist-info/WHEEL +5 -0
- supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.153.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,909 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF Rescue Module
|
|
3
|
+
Embeddable version of the AI-powered OCR tool for extracting text from poorly formatted PDFs
|
|
4
|
+
Uses OpenAI's GPT-4 Vision API
|
|
5
|
+
|
|
6
|
+
This module can be embedded in the main Supervertaler application as a tab.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import base64
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
import tkinter as tk
|
|
13
|
+
from tkinter import filedialog, messagebox, scrolledtext, ttk
|
|
14
|
+
from openai import OpenAI
|
|
15
|
+
from docx import Document
|
|
16
|
+
from docx.shared import Pt
|
|
17
|
+
import fitz # PyMuPDF
|
|
18
|
+
import re
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PDFRescue:
|
|
22
|
+
"""
|
|
23
|
+
PDF Rescue feature - extract text from images using AI OCR
|
|
24
|
+
Can be embedded in any tkinter application as a tab or panel
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
def __init__(self, parent_app):
|
|
28
|
+
"""
|
|
29
|
+
Initialize PDF Rescue module
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
parent_app: Reference to the main application (needs .api_keys attribute)
|
|
33
|
+
"""
|
|
34
|
+
self.parent_app = parent_app
|
|
35
|
+
self.client = None
|
|
36
|
+
self.image_files = []
|
|
37
|
+
self.extracted_texts = {}
|
|
38
|
+
|
|
39
|
+
# Initialize OpenAI client
|
|
40
|
+
api_key = None
|
|
41
|
+
if hasattr(parent_app, 'api_keys'):
|
|
42
|
+
api_key = parent_app.api_keys.get('openai')
|
|
43
|
+
elif hasattr(parent_app, 'api_key'):
|
|
44
|
+
api_key = parent_app.api_key
|
|
45
|
+
|
|
46
|
+
if api_key:
|
|
47
|
+
try:
|
|
48
|
+
self.client = OpenAI(api_key=api_key)
|
|
49
|
+
except Exception as e:
|
|
50
|
+
print(f"Failed to initialize OpenAI client: {e}")
|
|
51
|
+
|
|
52
|
+
def log_message(self, message: str):
|
|
53
|
+
"""Log a message to the parent app's log if available"""
|
|
54
|
+
if hasattr(self.parent_app, 'log'):
|
|
55
|
+
self.parent_app.log(f"[PDF Rescue] {message}")
|
|
56
|
+
else:
|
|
57
|
+
print(f"[PDF Rescue] {message}")
|
|
58
|
+
|
|
59
|
+
def create_tab(self, parent):
    """
    Build the PDF Rescue tab UI inside the given container.

    Creates, top to bottom: the header strip, a horizontal split view (file
    list on the left, extracted-text preview on the right), the processing
    options, the action buttons, the status label and the progress bar.
    The file list and extracted texts already held by this instance are
    kept and shown again in the new list box.

    Args:
        parent: The parent widget (notebook tab or frame)
    """
    # Snapshot current state so rebuilding the widgets does not lose it
    previous_files = self.image_files.copy() if hasattr(self, 'image_files') else []
    previous_texts = self.extracted_texts.copy() if hasattr(self, 'extracted_texts') else {}

    # --- Header strip ---
    header_bg = '#e3f2fd'
    header = tk.Frame(parent, bg=header_bg, relief='solid', borderwidth=1)
    header.pack(fill='x', padx=5, pady=5)

    title_label = tk.Label(header, text="🔍 PDF Rescue - AI-Powered OCR",
                           font=('Segoe UI', 10, 'bold'), bg=header_bg)
    title_label.pack(side='left', padx=10, pady=5)

    subtitle_label = tk.Label(header, text="Extract text from poorly formatted PDF screenshots",
                              font=('Segoe UI', 9), bg=header_bg, fg='#666')
    subtitle_label.pack(side='left', padx=(0, 10), pady=5)

    # --- Split view: file list (left) and text preview (right) ---
    split = ttk.PanedWindow(parent, orient='horizontal')
    split.pack(fill='both', expand=True, padx=5, pady=5)

    files_pane = tk.Frame(split)
    split.add(files_pane, weight=1)

    files_heading = tk.Label(files_pane, text="Images to Process",
                             font=('Segoe UI', 9, 'bold'))
    files_heading.pack(anchor='w', pady=(0, 5))

    list_holder = tk.Frame(files_pane)
    list_holder.pack(fill='both', expand=True)

    scrollbar = tk.Scrollbar(list_holder, orient='vertical')
    scrollbar.pack(side='right', fill='y')

    self.file_listbox = tk.Listbox(list_holder, yscrollcommand=scrollbar.set,
                                   font=('Consolas', 9))
    self.file_listbox.pack(fill='both', expand=True)
    scrollbar.config(command=self.file_listbox.yview)
    self.file_listbox.bind('<<ListboxSelect>>', self._on_file_select)

    # File management buttons: (text, command, background, font, horizontal gap)
    file_buttons = tk.Frame(files_pane)
    file_buttons.pack(fill='x', pady=(10, 0))

    file_button_specs = (
        ("📄 PDF", self._import_from_pdf, '#9C27B0', ('Segoe UI', 8, 'bold'), (0, 3)),
        ("➕ Add Files", self._add_files, '#2196F3', ('Segoe UI', 8, 'bold'), 3),
        ("📂 Folder", self._add_folder, '#2196F3', ('Segoe UI', 8, 'bold'), 3),
        ("Clear", self._clear_list, '#9E9E9E', ('Segoe UI', 8), 3),
    )
    for caption, handler, colour, button_font, gap in file_button_specs:
        button = tk.Button(file_buttons, text=caption, command=handler,
                           bg=colour, fg='white', font=button_font,
                           padx=8, pady=4)
        button.pack(side='left', padx=gap)

    preview_pane = tk.Frame(split)
    split.add(preview_pane, weight=2)

    preview_heading = tk.Label(preview_pane, text="Extracted Text Preview",
                               font=('Segoe UI', 9, 'bold'))
    preview_heading.pack(anchor='w', pady=(0, 5))

    self.preview_text = scrolledtext.ScrolledText(preview_pane, wrap='word',
                                                  font=('Segoe UI', 9),
                                                  height=15)
    self.preview_text.pack(fill='both', expand=True)

    # --- Processing options ---
    options_box = tk.LabelFrame(parent, text="Processing Options",
                                padx=10, pady=10)
    options_box.pack(fill='x', padx=5, pady=(0, 10))

    model_row = tk.Frame(options_box)
    model_row.pack(fill='x', pady=(0, 5))

    model_caption = tk.Label(model_row, text="Model:", font=('Segoe UI', 9))
    model_caption.pack(side='left', padx=(0, 5))

    self.model_var = tk.StringVar(value="gpt-4o")
    model_picker = ttk.Combobox(model_row, textvariable=self.model_var,
                                values=["gpt-4o", "gpt-4o-mini", "gpt-4-turbo"],
                                width=20, state='readonly')
    model_picker.pack(side='left')

    self.preserve_formatting_var = tk.BooleanVar(value=True)
    formatting_toggle = tk.Checkbutton(model_row,
                                       text="Preserve formatting (bold/italic/underline)",
                                       variable=self.preserve_formatting_var,
                                       font=('Segoe UI', 9))
    formatting_toggle.pack(side='left', padx=(20, 0))

    instructions_bar = tk.Frame(options_box)
    instructions_bar.pack(fill='x', pady=(5, 2))

    instructions_caption = tk.Label(instructions_bar, text="Extraction Instructions:",
                                    font=('Segoe UI', 9))
    instructions_caption.pack(side='left')

    show_prompt_button = tk.Button(instructions_bar, text="👁️ Show Prompt",
                                   command=self._show_full_prompt,
                                   bg='#9C27B0', fg='white', font=('Segoe UI', 8),
                                   padx=8, pady=2)
    show_prompt_button.pack(side='right')

    self.instructions_text = scrolledtext.ScrolledText(options_box, wrap='word',
                                                       font=('Segoe UI', 9),
                                                       height=3)
    self.instructions_text.pack(fill='x')

    # Pre-filled prompt; the user can edit it before processing
    default_instructions = """Extract all text from this image. The image is a screenshot from a poorly formatted PDF.
Please:
- Extract all visible text accurately
- Fix any obvious OCR errors or formatting issues
- Remove extraneous line breaks within paragraphs
- Preserve intentional paragraph breaks
- Maintain the logical flow and structure of the content
- For redacted/blacked-out text: insert a descriptive placeholder in square brackets in the document's language (e.g., [naam] for Dutch names, [name] for English names, [bedrag] for amounts, etc.)
- For stamps, signatures, or images: insert a descriptive placeholder in square brackets in the document's language (e.g., [handtekening], [stempel], [signature], [stamp], etc.)
- For any non-text elements that would normally appear: describe them briefly in square brackets
- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__
- Output clean, readable text only (no commentary)"""

    self.instructions_text.insert('1.0', default_instructions)

    # --- Action buttons: (text, command, background, horizontal gap) ---
    action_bar = tk.Frame(parent, bg='white')
    action_bar.pack(fill='x', padx=5, pady=(0, 10))

    action_specs = (
        ("🔍 Process Selected", self._process_selected, '#FF9800', (0, 5)),
        ("⚡ Process ALL", self._process_all, '#4CAF50', 5),
        ("💾 Save DOCX", self._save_to_docx, '#2196F3', 5),
        ("📋 Copy All", self._copy_all_text, '#607D8B', 5),
        ("📊 Session Report", self._save_session_report, '#795548', 5),
    )
    for caption, handler, colour, gap in action_specs:
        button = tk.Button(action_bar, text=caption, command=handler,
                           bg=colour, fg='white', font=('Segoe UI', 9, 'bold'),
                           padx=15, pady=6)
        button.pack(side='left', padx=gap)

    # --- Status line and progress bar ---
    self.status_label = tk.Label(parent, text="Ready - Add images to begin",
                                 font=('Segoe UI', 9), fg='#666', bg='white')
    self.status_label.pack(pady=(0, 5))

    self.progress = ttk.Progressbar(parent, mode='determinate')
    self.progress.pack(fill='x', padx=5, pady=(0, 5))

    # Put the snapshot back and show it in the freshly created list box
    self.image_files = previous_files
    self.extracted_texts = previous_texts
    if self.image_files:
        self._update_listbox()
|
|
232
|
+
|
|
233
|
+
# === File Management Methods ===
|
|
234
|
+
|
|
235
|
+
def _import_from_pdf(self):
    """
    Import every page of a user-selected PDF as a PNG image.

    Asks for a PDF file, renders each page at 2x zoom and saves it as
    "<pdf name>_page_NNN.png" in a "<pdf name>_images" folder next to the
    PDF. Images not already in ``self.image_files`` are appended and the
    list box is refreshed. Any failure is shown in an error dialog; no
    exception propagates to the caller.
    """
    pdf_file = filedialog.askopenfilename(
        title="Select PDF File",
        filetypes=[
            ("PDF files", "*.pdf"),
            ("All files", "*.*")
        ]
    )

    if not pdf_file:
        return

    def refresh_ui():
        # Not every host application exposes a Tk root (the processing
        # methods guard this access the same way)
        if hasattr(self.parent_app, 'root'):
            self.parent_app.root.update_idletasks()

    try:
        pdf_path = Path(pdf_file)
        pdf_name = pdf_path.stem
        extracted_count = 0

        doc = fitz.open(pdf_file)
        try:
            total_pages = len(doc)

            if total_pages == 0:
                messagebox.showwarning("Empty PDF", "The selected PDF has no pages.")
                return

            # Create folder for extracted images next to the PDF
            images_folder = pdf_path.parent / f"{pdf_name}_images"
            images_folder.mkdir(exist_ok=True)
            temp_dir = str(images_folder)

            self.log_message(f"Starting PDF import: {pdf_path.name}")
            self.log_message(f"Total pages: {total_pages}")

            self.status_label.config(text="Extracting pages from PDF...")
            refresh_ui()

            # Render at 2x resolution for better OCR quality; the matrix is
            # the same for every page, so build it once
            zoom = 2.0
            mat = fitz.Matrix(zoom, zoom)

            for page_num in range(total_pages):
                pix = doc[page_num].get_pixmap(matrix=mat)

                # Save as PNG
                img_filename = f"{pdf_name}_page_{page_num + 1:03d}.png"
                img_path = os.path.join(temp_dir, img_filename)
                pix.save(img_path)

                # Only newly listed images count towards the imported total
                if img_path not in self.image_files:
                    self.image_files.append(img_path)
                    extracted_count += 1

                self.log_message(f" Page {page_num + 1}/{total_pages} extracted: {img_filename}")

                self.status_label.config(
                    text=f"Extracting page {page_num + 1}/{total_pages}..."
                )
                refresh_ui()
        finally:
            # Release the file handle on every path: success, the empty-PDF
            # early return, and a rendering/saving error
            doc.close()

        # Update UI
        self._update_listbox()
        self.status_label.config(
            text=f"Imported {extracted_count} page(s) from PDF"
        )

        self.log_message(f"PDF import complete: {extracted_count} pages extracted")
        self.log_message(f"Temporary folder: {temp_dir}")

        messagebox.showinfo(
            "PDF Import Complete",
            f"Successfully extracted {extracted_count} page(s) from:\n{pdf_path.name}\n\n"
            f"Images saved to folder:\n{temp_dir}\n\n"
            f"These images are kept for your reference and can be useful for the end client.\n\n"
            f"You can now process these pages with AI OCR."
        )

    except Exception as e:
        messagebox.showerror("PDF Import Error", f"Failed to import PDF:\n{str(e)}")
        self.status_label.config(text="PDF import failed")
|
|
328
|
+
|
|
329
|
+
def _add_files(self):
    """
    Add individual image files chosen in a file dialog.

    Files already in ``self.image_files`` are skipped, and the status line
    reports only the number of files actually added.
    """
    files = filedialog.askopenfilenames(
        title="Select Image Files",
        filetypes=[
            ("Image files", "*.jpg *.jpeg *.png *.bmp *.gif *.tiff"),
            ("All files", "*.*")
        ]
    )

    if not files:
        return

    added = 0
    for file in files:
        if file not in self.image_files:
            self.image_files.append(file)
            added += 1

    self._update_listbox()
    # Count new entries only, not every selected file (duplicates are skipped)
    self.status_label.config(text=f"Added {added} file(s)")
|
|
345
|
+
|
|
346
|
+
def _add_folder(self):
    """Append every not-yet-listed image file found directly inside a chosen folder."""
    chosen_dir = filedialog.askdirectory(title="Select Folder with Images")
    if not chosen_dir:
        return

    wanted_suffixes = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}
    new_paths = []

    # Sorted so the files are added in name order
    for entry in sorted(os.listdir(chosen_dir)):
        candidate = os.path.join(chosen_dir, entry)
        if not os.path.isfile(candidate):
            continue
        suffix = os.path.splitext(entry)[1].lower()
        if suffix not in wanted_suffixes or candidate in self.image_files:
            continue
        new_paths.append(candidate)

    self.image_files.extend(new_paths)
    self._update_listbox()
    self.status_label.config(text=f"Added {len(new_paths)} file(s) from folder")
|
|
364
|
+
|
|
365
|
+
def _clear_list(self):
|
|
366
|
+
"""Clear all files"""
|
|
367
|
+
if self.image_files and messagebox.askyesno("Clear", "Remove all files?"):
|
|
368
|
+
self.image_files = []
|
|
369
|
+
self.extracted_texts = {}
|
|
370
|
+
self._update_listbox()
|
|
371
|
+
self.preview_text.delete('1.0', tk.END)
|
|
372
|
+
self.status_label.config(text="List cleared")
|
|
373
|
+
|
|
374
|
+
def _update_listbox(self):
|
|
375
|
+
"""Update file listbox"""
|
|
376
|
+
self.file_listbox.delete(0, tk.END)
|
|
377
|
+
for i, file in enumerate(self.image_files, 1):
|
|
378
|
+
filename = os.path.basename(file)
|
|
379
|
+
status = "✓ " if file in self.extracted_texts else ""
|
|
380
|
+
self.file_listbox.insert(tk.END, f"{status}{i:2d}. {filename}")
|
|
381
|
+
|
|
382
|
+
def _on_file_select(self, event):
|
|
383
|
+
"""Show extracted text when file is selected"""
|
|
384
|
+
selection = self.file_listbox.curselection()
|
|
385
|
+
if not selection:
|
|
386
|
+
return
|
|
387
|
+
|
|
388
|
+
idx = selection[0]
|
|
389
|
+
if idx < len(self.image_files):
|
|
390
|
+
file = self.image_files[idx]
|
|
391
|
+
if file in self.extracted_texts:
|
|
392
|
+
self.preview_text.delete('1.0', tk.END)
|
|
393
|
+
self.preview_text.insert('1.0', self.extracted_texts[file])
|
|
394
|
+
|
|
395
|
+
def _show_full_prompt(self):
    """Open a read-only popup with the instructions as they will be sent, plus the current settings."""
    prompt = self.instructions_text.get('1.0', 'end-1c').strip()

    # Same add/remove rule for the markdown line as in _extract_text_from_image
    markdown_rule = "- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__"
    if not self.preserve_formatting_var.get():
        prompt = prompt.replace("\n" + markdown_rule, "").replace(markdown_rule, "")
    elif "markdown for text formatting" not in prompt:
        prompt += "\n" + markdown_rule

    # Popup window
    window = tk.Toplevel()
    window.title("Full Prompt Preview")
    window.geometry("700x600")

    body = tk.Frame(window, padx=15, pady=15)
    body.pack(fill='both', expand=True)

    heading = tk.Label(body, text="Exact Prompt Sent to OpenAI API",
                       font=('Segoe UI', 12, 'bold'))
    heading.pack(pady=(0, 10))

    # Configuration summary
    config_box = tk.LabelFrame(body, text="Configuration", padx=10, pady=10)
    config_box.pack(fill='x', pady=(0, 10))

    model_line = tk.Label(config_box, text=f"Model: {self.model_var.get()}",
                          font=('Segoe UI', 9, 'bold'))
    model_line.pack(anchor='w')

    if self.preserve_formatting_var.get():
        formatting_state = "✓ Enabled"
    else:
        formatting_state = "✗ Disabled"
    formatting_line = tk.Label(config_box,
                               text=f"Formatting Preservation: {formatting_state}",
                               font=('Segoe UI', 9))
    formatting_line.pack(anchor='w')

    tokens_line = tk.Label(config_box, text="Max Tokens: 4000",
                           font=('Segoe UI', 9))
    tokens_line.pack(anchor='w')

    # The prompt itself, made read-only after filling
    prompt_box = tk.LabelFrame(body, text="Full Instructions Text",
                               padx=10, pady=10)
    prompt_box.pack(fill='both', expand=True, pady=(0, 10))

    prompt_view = scrolledtext.ScrolledText(prompt_box, wrap='word',
                                            font=('Consolas', 9))
    prompt_view.pack(fill='both', expand=True)
    prompt_view.insert('1.0', prompt)
    prompt_view.config(state='disabled')

    footnote = tk.Label(body,
                        text="Note: The image is sent as base64-encoded data along with these instructions.",
                        font=('Segoe UI', 8), fg='#666')
    footnote.pack(pady=(0, 5))

    close_button = tk.Button(body, text="Close", command=window.destroy,
                             bg='#607D8B', fg='white', font=('Segoe UI', 9, 'bold'),
                             padx=20, pady=6)
    close_button.pack()
|
|
459
|
+
|
|
460
|
+
def _save_session_report(self):
    """
    Generate and save a session report in markdown format.

    The report contains the configuration, the extraction instructions as
    they are sent to the model (including the formatting add/remove rule),
    a per-file status table, the extracted text of every processed image
    and simple statistics. After saving, the user may open the report with
    the platform's default application.

    A failure to write the file is reported as an error. A failure to open
    the already-saved file is reported separately as a warning.
    """
    if not self.extracted_texts:
        messagebox.showwarning("No Data", "No OCR processing has been performed yet.\n\n"
                               "Process some images first to generate a session report.")
        return

    # Ask for save location
    output_file = filedialog.asksaveasfilename(
        title="Save Session Report",
        defaultextension=".md",
        filetypes=[("Markdown files", "*.md"), ("Text files", "*.txt"), ("All files", "*.*")],
        initialfile="PDF_Rescue_SessionReport.md"
    )

    if not output_file:
        return

    try:
        from datetime import datetime

        formatting_enabled = self.preserve_formatting_var.get()
        processed_count = len(self.extracted_texts)  # > 0, checked above

        report_lines = []
        report_lines.append("# PDF Rescue - Session Report\n")
        report_lines.append("**Generated by [Supervertaler](https://supervertaler.com/) • by Michael Beijer**\n\n")
        report_lines.append(f"**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        report_lines.append("---\n\n")

        # Configuration section
        report_lines.append("## Configuration\n\n")
        report_lines.append(f"- **Model**: {self.model_var.get()}\n")
        formatting_status = "Enabled ✓" if formatting_enabled else "Disabled ✗"
        report_lines.append(f"- **Formatting Preservation**: {formatting_status}\n")
        report_lines.append(f"- **Total Images Processed**: {processed_count}\n")
        report_lines.append(f"- **Total Images in List**: {len(self.image_files)}\n\n")

        # Instructions used. Apply the same add/remove rule as
        # _extract_text_from_image so the report matches the prompt sent.
        markdown_rule = "- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__"
        instructions = self.instructions_text.get('1.0', 'end-1c').strip()
        if formatting_enabled:
            if "markdown for text formatting" not in instructions:
                instructions += "\n" + markdown_rule
        else:
            instructions = instructions.replace("\n" + markdown_rule, "").replace(markdown_rule, "")

        report_lines.append("## Extraction Instructions\n\n")
        report_lines.append("```\n")
        report_lines.append(instructions)
        report_lines.append("\n```\n\n")

        # Processing summary
        report_lines.append("## Processing Summary\n\n")
        report_lines.append("| # | Image File | Status |\n")
        report_lines.append("|---|------------|--------|\n")

        for i, file in enumerate(self.image_files, 1):
            filename = os.path.basename(file)
            status = "✓ Processed" if file in self.extracted_texts else "⧗ Pending"
            report_lines.append(f"| {i} | {filename} | {status} |\n")

        report_lines.append("\n---\n\n")

        # Extracted text for each image
        report_lines.append("## Extracted Text\n\n")

        for i, file in enumerate(self.image_files, 1):
            if file in self.extracted_texts:
                filename = os.path.basename(file)
                report_lines.append(f"### Page {i}: {filename}\n\n")
                report_lines.append("```\n")
                report_lines.append(self.extracted_texts[file])
                report_lines.append("\n```\n\n")
                report_lines.append("---\n\n")

        # Statistics
        report_lines.append("## Statistics\n\n")
        total_chars = sum(len(text) for text in self.extracted_texts.values())
        total_words = sum(len(text.split()) for text in self.extracted_texts.values())
        report_lines.append(f"- **Total Characters Extracted**: {total_chars:,}\n")
        report_lines.append(f"- **Total Words Extracted**: {total_words:,}\n")
        report_lines.append(f"- **Average Characters per Page**: {total_chars // processed_count:,}\n")
        report_lines.append(f"- **Average Words per Page**: {total_words // processed_count:,}\n\n")

        # Footer
        report_lines.append("---\n\n")
        report_lines.append("*Report generated by **PDF Rescue** - AI-Powered OCR Tool*\n\n")
        report_lines.append("*Part of [**Supervertaler**](https://supervertaler.com/) • by Michael Beijer*\n")

        # Write to file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(report_lines)

    except Exception as e:
        messagebox.showerror("Error", f"Failed to save report:\n\n{str(e)}")
        return

    self.log_message(f"Session report saved: {Path(output_file).name}")
    self.status_label.config(text=f"✓ Report saved to {os.path.basename(output_file)}")

    if not messagebox.askyesno("Success",
                               f"Session report saved successfully!\n\n"
                               f"File: {Path(output_file).name}\n\n"
                               "Open the report now?"):
        return

    # Opening is separate from saving: a failure here must not be reported
    # as a failed save. os.startfile exists only on Windows.
    try:
        if hasattr(os, 'startfile'):
            os.startfile(output_file)
        else:
            import subprocess
            import sys

            opener = 'open' if sys.platform == 'darwin' else 'xdg-open'
            subprocess.Popen([opener, output_file])
    except OSError as e:
        messagebox.showwarning("Open Report",
                               f"The report was saved, but could not be opened automatically:\n\n{str(e)}")
|
|
559
|
+
|
|
560
|
+
# === OCR Processing Methods ===
|
|
561
|
+
|
|
562
|
+
def _encode_image(self, image_path):
|
|
563
|
+
"""Encode image to base64"""
|
|
564
|
+
with open(image_path, "rb") as image_file:
|
|
565
|
+
return base64.b64encode(image_file.read()).decode('utf-8')
|
|
566
|
+
|
|
567
|
+
def _extract_text_from_image(self, image_path):
|
|
568
|
+
"""Use GPT-4 Vision to extract text from image"""
|
|
569
|
+
if not self.client:
|
|
570
|
+
return "[ERROR: OpenAI client not initialized. Check API key.]"
|
|
571
|
+
|
|
572
|
+
try:
|
|
573
|
+
base64_image = self._encode_image(image_path)
|
|
574
|
+
instructions = self.instructions_text.get('1.0', 'end-1c').strip()
|
|
575
|
+
|
|
576
|
+
# Add or remove formatting instruction based on checkbox
|
|
577
|
+
if self.preserve_formatting_var.get():
|
|
578
|
+
if "markdown for text formatting" not in instructions:
|
|
579
|
+
instructions += "\n- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__"
|
|
580
|
+
else:
|
|
581
|
+
# Remove markdown instruction if present
|
|
582
|
+
instructions = instructions.replace(
|
|
583
|
+
"\n- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__", ""
|
|
584
|
+
).replace(
|
|
585
|
+
"- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__", ""
|
|
586
|
+
)
|
|
587
|
+
|
|
588
|
+
response = self.client.chat.completions.create(
|
|
589
|
+
model=self.model_var.get(),
|
|
590
|
+
messages=[
|
|
591
|
+
{
|
|
592
|
+
"role": "user",
|
|
593
|
+
"content": [
|
|
594
|
+
{
|
|
595
|
+
"type": "text",
|
|
596
|
+
"text": instructions
|
|
597
|
+
},
|
|
598
|
+
{
|
|
599
|
+
"type": "image_url",
|
|
600
|
+
"image_url": {
|
|
601
|
+
"url": f"data:image/jpeg;base64,{base64_image}"
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
]
|
|
605
|
+
}
|
|
606
|
+
],
|
|
607
|
+
max_tokens=4000
|
|
608
|
+
)
|
|
609
|
+
|
|
610
|
+
return response.choices[0].message.content
|
|
611
|
+
|
|
612
|
+
except Exception as e:
|
|
613
|
+
return f"[ERROR extracting text: {str(e)}]"
|
|
614
|
+
|
|
615
|
+
def _process_selected(self):
|
|
616
|
+
"""Process currently selected image"""
|
|
617
|
+
selection = self.file_listbox.curselection()
|
|
618
|
+
if not selection:
|
|
619
|
+
messagebox.showwarning("No Selection", "Please select an image to process")
|
|
620
|
+
return
|
|
621
|
+
|
|
622
|
+
idx = selection[0]
|
|
623
|
+
if idx >= len(self.image_files):
|
|
624
|
+
return
|
|
625
|
+
|
|
626
|
+
file = self.image_files[idx]
|
|
627
|
+
filename = os.path.basename(file)
|
|
628
|
+
|
|
629
|
+
self.log_message(f"Processing selected image: {filename}")
|
|
630
|
+
self.status_label.config(text=f"Processing {filename}...")
|
|
631
|
+
if hasattr(self.parent_app, 'root'):
|
|
632
|
+
self.parent_app.root.update()
|
|
633
|
+
|
|
634
|
+
text = self._extract_text_from_image(file)
|
|
635
|
+
self.extracted_texts[file] = text
|
|
636
|
+
|
|
637
|
+
self.preview_text.delete('1.0', tk.END)
|
|
638
|
+
self.preview_text.insert('1.0', text)
|
|
639
|
+
|
|
640
|
+
self._update_listbox()
|
|
641
|
+
self.log_message(f"Successfully processed: {filename}")
|
|
642
|
+
self.status_label.config(text=f"✓ Processed {filename}")
|
|
643
|
+
|
|
644
|
+
def _process_all(self):
    """Extract text from every image in the list, after user confirmation.

    Images that already have a successful result in
    ``self.extracted_texts`` are skipped.  Entries holding an error
    placeholder (a string starting with ``"[ERROR extracting text:"``, as
    returned by ``_extract_text_from_image`` on failure) are retried.
    The final log line and dialog report processed / skipped / failed
    counts instead of claiming that every image succeeded.
    """
    if not self.image_files:
        messagebox.showwarning("No Files", "Please add images first")
        return

    if not messagebox.askyesno("Process All",
                               f"Process all {len(self.image_files)} images?\n\n"
                               "This will use API credits and may take several minutes."):
        return

    error_prefix = "[ERROR extracting text:"

    def is_error(value):
        return isinstance(value, str) and value.startswith(error_prefix)

    # Work on a snapshot: root.update() below dispatches pending UI events,
    # so handlers elsewhere could modify self.image_files mid-loop.
    files = list(self.image_files)
    total = len(files)

    self.log_message(f"Starting batch processing: {total} images")
    self.progress['maximum'] = total
    self.progress['value'] = 0

    processed = skipped = failed = 0

    for i, file in enumerate(files, 1):
        filename = os.path.basename(file)
        self.status_label.config(text=f"Processing {i}/{total}: {filename}...")
        if hasattr(self.parent_app, 'root'):
            self.parent_app.root.update()

        already_done = (file in self.extracted_texts
                        and not is_error(self.extracted_texts[file]))
        if already_done:
            skipped += 1
            self.log_message(f" [{i}/{total}] Skipped (already processed): {filename}")
        else:
            text = self._extract_text_from_image(file)
            self.extracted_texts[file] = text
            if is_error(text):
                failed += 1
                self.log_message(f" [{i}/{total}] FAILED: {filename} - {text}")
            else:
                processed += 1
                self.log_message(f" [{i}/{total}] Processed: {filename}")

        self.progress['value'] = i
        self._update_listbox()

    self.log_message(
        f"Batch processing complete: {processed} processed, "
        f"{skipped} skipped, {failed} failed"
    )

    if failed:
        self.status_label.config(text=f"⚠ Finished with {failed} of {total} images failed")
        messagebox.showwarning("Complete with errors",
                               f"Processed {processed} images, skipped {skipped}, "
                               f"failed {failed}.\n\n"
                               "See the activity log for details. "
                               "Run 'Process All' again to retry the failed images.")
    else:
        self.status_label.config(text=f"✓ Processed all {total} images!")
        messagebox.showinfo("Complete",
                            f"Successfully processed {total} images!\n\n"
                            "Click 'Save DOCX' to export the text.")
|
|
680
|
+
|
|
681
|
+
# === Export Methods ===
|
|
682
|
+
|
|
683
|
+
def _add_formatted_text(self, doc, text):
    """Append ``text`` to ``doc``, converting simple markdown into runs.

    Each non-blank line of ``text`` becomes one paragraph (line spacing
    1.15, 12pt space after).  Within a line the following inline markers
    are recognised:

        ***bold italic***   **bold**   *italic*   __underline__

    A marker pair only counts as formatting when the enclosed text is
    non-empty and does not begin or end with whitespace or the marker
    character itself.  Everything else is kept literally, so content such
    as ``________`` (form blanks), ``5 * 3 * 2`` or an unclosed ``**`` is
    preserved rather than silently dropped.  Markers nested inside another
    formatted span are not interpreted; they are emitted as literal text
    of the outer run.

    Args:
        doc: python-docx document (anything providing ``add_paragraph()``).
        text: Extracted text, possibly containing markdown markers.
    """
    if not text:
        return

    # (compiled pattern, bold, italic, underline) - tried in this order so
    # that longer markers win over their shorter prefixes.
    styles = (
        (re.compile(r'\*\*\*(?=[^\s*])(.+?)(?<=[^\s*])\*\*\*'), True, True, False),
        (re.compile(r'\*\*(?=[^\s*])(.+?)(?<=[^\s*])\*\*'), True, False, False),
        (re.compile(r'__(?=[^\s_])(.+?)(?<=[^\s_])__'), False, False, True),
        (re.compile(r'\*(?=[^\s*])(.+?)(?<=[^\s*])\*'), False, True, False),
    )
    # Any position where a formatting marker could begin.
    marker_start = re.compile(r'\*|__')

    for para_text in text.split('\n'):
        if not para_text.strip():
            continue

        para = doc.add_paragraph()
        para.paragraph_format.line_spacing = 1.15
        para.paragraph_format.space_after = Pt(12)

        pos = 0
        end = len(para_text)
        pending = []  # literal fragments to be merged into one plain run

        while pos < end:
            hit = None
            for regex, bold, italic, underline in styles:
                hit = regex.match(para_text, pos)
                if hit:
                    break

            if hit is None:
                # Not a formatted span: keep the text literally up to the
                # next place a marker could start.  Searching from pos + 1
                # guarantees progress when pos itself is a stray marker.
                nxt = marker_start.search(para_text, pos + 1)
                stop = nxt.start() if nxt else end
                pending.append(para_text[pos:stop])
                pos = stop
                continue

            if pending:
                para.add_run(''.join(pending))
                pending = []

            run = para.add_run(hit.group(1))
            if bold:
                run.bold = True
            if italic:
                run.italic = True
            if underline:
                run.underline = True
            pos = hit.end()

        if pending:
            para.add_run(''.join(pending))
|
|
751
|
+
|
|
752
|
+
def _save_to_docx(self):
    """Save all extracted text to a Word document chosen by the user.

    Pages are written in ``self.image_files`` order; only files that have
    an entry in ``self.extracted_texts`` are included, each under a
    "Page N: <filename>" heading where N is the file's position in the
    image list.  A page break is inserted between written pages (never
    after the last one).  After a successful save the user may open the
    document with the platform's default application.
    """
    import subprocess
    import sys

    if not self.extracted_texts:
        messagebox.showwarning("No Text", "No extracted text to save.\n\n"
                               "Process images first.")
        return

    output_file = filedialog.asksaveasfilename(
        title="Save Extracted Text",
        defaultextension=".docx",
        filetypes=[("Word Document", "*.docx"), ("All files", "*.*")],
        initialfile="extracted_text.docx"
    )

    if not output_file:
        return

    self.log_message(f"Saving extracted text to DOCX: {Path(output_file).name}")

    # Resolve the pages up front so the page-break logic and the reported
    # count refer to what is actually written, not to the whole image list.
    pages = [(number, file)
             for number, file in enumerate(self.image_files, 1)
             if file in self.extracted_texts]

    try:
        doc = Document()

        # Add title
        title = doc.add_heading('Extracted Text from Images', 0)
        title.runs[0].font.size = Pt(16)

        # Add extracted text in order
        for position, (number, file) in enumerate(pages, 1):
            # Page header
            heading = doc.add_heading(f'Page {number}: {os.path.basename(file)}', level=2)
            heading.runs[0].font.size = Pt(12)

            # Text content with formatting
            text = self.extracted_texts[file]
            if self.preserve_formatting_var.get():
                self._add_formatted_text(doc, text)
            else:
                para = doc.add_paragraph(text)
                para.paragraph_format.line_spacing = 1.15
                para.paragraph_format.space_after = Pt(12)

            # Page break except after the last written page
            if position < len(pages):
                doc.add_page_break()

        doc.save(output_file)

    except Exception as e:
        self.log_message(f"Failed to save document: {e}")
        messagebox.showerror("Error", f"Failed to save document:\n\n{str(e)}")
        return

    self.log_message(f"Successfully saved {len(pages)} pages to: {Path(output_file).name}")
    self.status_label.config(text=f"✓ Saved to {os.path.basename(output_file)}")

    if not messagebox.askyesno("Success",
                               f"Document saved successfully!\n\n"
                               f"{len(pages)} pages of text extracted\n\n"
                               "Open the document now?"):
        return

    # os.startfile exists only on Windows; elsewhere hand the file to the
    # desktop's opener.  Failures here must not be reported as save errors.
    try:
        if sys.platform.startswith('win'):
            os.startfile(output_file)
        elif sys.platform == 'darwin':
            subprocess.Popen(['open', output_file])
        else:
            subprocess.Popen(['xdg-open', output_file])
    except OSError as e:
        self.log_message(f"Could not open saved document: {e}")
        messagebox.showwarning("Open Failed",
                               "The document was saved, but it could not be "
                               f"opened automatically:\n\n{e}")
|
|
811
|
+
|
|
812
|
+
def _copy_all_text(self):
    """Copy the extracted text of all pages to the system clipboard.

    Pages are emitted in ``self.image_files`` order, each preceded by a
    ``=== Page N: <filename> ===`` header line.  The clipboard is accessed
    through the parent app's root window when it has one; otherwise
    through this tab's own status label, so the text is really copied
    before success is reported.
    """
    if not self.extracted_texts:
        messagebox.showwarning("No Text", "No extracted text to copy")
        return

    all_text = []
    page_count = 0
    for i, file in enumerate(self.image_files, 1):
        if file in self.extracted_texts:
            all_text.append(f"=== Page {i}: {os.path.basename(file)} ===\n")
            all_text.append(self.extracted_texts[file])
            all_text.append("\n\n")
            page_count += 1

    combined = "".join(all_text)

    # Tk exposes the clipboard methods on every widget, so any widget of
    # the running application can stand in for the root window.
    # NOTE(review): assumes self.status_label is a Tk widget - confirm.
    clipboard_owner = getattr(self.parent_app, 'root', None)
    if clipboard_owner is None:
        clipboard_owner = self.status_label

    try:
        clipboard_owner.clipboard_clear()
        clipboard_owner.clipboard_append(combined)
    except (AttributeError, tk.TclError) as e:
        messagebox.showerror("Copy Failed", f"Could not copy text to clipboard:\n\n{e}")
        return

    self.status_label.config(text=f"✓ Copied {page_count} pages to clipboard")
    messagebox.showinfo("Copied", f"Copied text from {page_count} pages to clipboard!")
|
|
834
|
+
|
|
835
|
+
|
|
836
|
+
# === Standalone Application ===
|
|
837
|
+
|
|
838
|
+
if __name__ == "__main__":
    # Run PDF Rescue as a standalone application

    class StandaloneApp:
        """Minimal parent app for standalone mode.

        Provides what ``PDFRescue`` is given here as a parent: a ``root``
        window, an ``api_keys`` dict and a ``log()`` method.  ``ready`` is
        True only when start-up completed; when the OpenAI key is missing
        the root window is destroyed and ``ready`` stays False.
        """

        def __init__(self):
            self.ready = False
            self.root = tk.Tk()
            self.root.title("PDF Rescue - AI-Powered OCR Tool")
            self.root.geometry("1000x700")

            # Load API key from api_keys.txt (looked up in the working directory)
            self.api_keys = {}
            api_file = Path("api_keys.txt")
            if api_file.exists():
                with open(api_file, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        # Skip blanks, comments and lines without KEY=VALUE
                        if not line or line.startswith('#') or '=' not in line:
                            continue
                        key, value = line.split('=', 1)
                        if 'openai' in key.lower():
                            self.api_keys['openai'] = value.strip()

            if not self.api_keys.get('openai'):
                messagebox.showerror(
                    "API Key Missing",
                    "Could not find OpenAI API key in api_keys.txt\n\n"
                    "Please add a line like:\nOPENAI_API_KEY=your-key-here"
                )
                self.root.destroy()
                return

            # Create main container
            main_frame = tk.Frame(self.root)
            main_frame.pack(fill='both', expand=True, padx=10, pady=10)

            # Add title
            title = tk.Label(main_frame, text="PDF Rescue - AI-Powered OCR Tool",
                             font=('Segoe UI', 14, 'bold'))
            title.pack(pady=(0, 10))

            # Create PDF Rescue instance
            self.pdf_rescue = PDFRescue(self)
            self.pdf_rescue.create_tab(main_frame)

            # Add log at bottom
            log_frame = tk.LabelFrame(self.root, text="Activity Log", padx=5, pady=5)
            log_frame.pack(fill='x', padx=10, pady=(0, 10))

            self.log_text = scrolledtext.ScrolledText(log_frame, height=4, wrap='word',
                                                      font=('Consolas', 9))
            self.log_text.pack(fill='both', expand=True)
            self.log_text.config(state='disabled')

            self.ready = True

        def log(self, message: str):
            """Append a timestamped message to the activity log."""
            from datetime import datetime
            timestamp = datetime.now().strftime("%H:%M:%S")
            formatted_message = f"[{timestamp}] {message}\n"

            # The log widget is the last thing __init__ builds; anything
            # logged before that goes to stdout instead of raising.
            log_widget = getattr(self, 'log_text', None)
            if log_widget is None:
                print(formatted_message, end='')
                return

            # The widget is kept read-only; unlock it just for the insert.
            log_widget.config(state='normal')
            log_widget.insert('end', formatted_message)
            log_widget.see('end')
            log_widget.config(state='disabled')

        def run(self):
            """Start the application"""
            self.root.mainloop()

    # Create and run standalone app.  Check the ready flag rather than
    # querying the root window: after root.destroy() any winfo_* call
    # raises TclError ("application has been destroyed").
    app = StandaloneApp()
    if app.ready:
        app.run()
|