supervertaler 1.9.153__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of supervertaler might be problematic. Click here for more details.
- Supervertaler.py +47886 -0
- modules/__init__.py +10 -0
- modules/ai_actions.py +964 -0
- modules/ai_attachment_manager.py +343 -0
- modules/ai_file_viewer_dialog.py +210 -0
- modules/autofingers_engine.py +466 -0
- modules/cafetran_docx_handler.py +379 -0
- modules/config_manager.py +469 -0
- modules/database_manager.py +1878 -0
- modules/database_migrations.py +417 -0
- modules/dejavurtf_handler.py +779 -0
- modules/document_analyzer.py +427 -0
- modules/docx_handler.py +689 -0
- modules/encoding_repair.py +319 -0
- modules/encoding_repair_Qt.py +393 -0
- modules/encoding_repair_ui.py +481 -0
- modules/feature_manager.py +350 -0
- modules/figure_context_manager.py +340 -0
- modules/file_dialog_helper.py +148 -0
- modules/find_replace.py +164 -0
- modules/find_replace_qt.py +457 -0
- modules/glossary_manager.py +433 -0
- modules/image_extractor.py +188 -0
- modules/keyboard_shortcuts_widget.py +571 -0
- modules/llm_clients.py +1211 -0
- modules/llm_leaderboard.py +737 -0
- modules/llm_superbench_ui.py +1401 -0
- modules/local_llm_setup.py +1104 -0
- modules/model_update_dialog.py +381 -0
- modules/model_version_checker.py +373 -0
- modules/mqxliff_handler.py +638 -0
- modules/non_translatables_manager.py +743 -0
- modules/pdf_rescue_Qt.py +1822 -0
- modules/pdf_rescue_tkinter.py +909 -0
- modules/phrase_docx_handler.py +516 -0
- modules/project_home_panel.py +209 -0
- modules/prompt_assistant.py +357 -0
- modules/prompt_library.py +689 -0
- modules/prompt_library_migration.py +447 -0
- modules/quick_access_sidebar.py +282 -0
- modules/ribbon_widget.py +597 -0
- modules/sdlppx_handler.py +874 -0
- modules/setup_wizard.py +353 -0
- modules/shortcut_manager.py +932 -0
- modules/simple_segmenter.py +128 -0
- modules/spellcheck_manager.py +727 -0
- modules/statuses.py +207 -0
- modules/style_guide_manager.py +315 -0
- modules/superbench_ui.py +1319 -0
- modules/superbrowser.py +329 -0
- modules/supercleaner.py +600 -0
- modules/supercleaner_ui.py +444 -0
- modules/superdocs.py +19 -0
- modules/superdocs_viewer_qt.py +382 -0
- modules/superlookup.py +252 -0
- modules/tag_cleaner.py +260 -0
- modules/tag_manager.py +333 -0
- modules/term_extractor.py +270 -0
- modules/termbase_entry_editor.py +842 -0
- modules/termbase_import_export.py +488 -0
- modules/termbase_manager.py +1060 -0
- modules/termview_widget.py +1172 -0
- modules/theme_manager.py +499 -0
- modules/tm_editor_dialog.py +99 -0
- modules/tm_manager_qt.py +1280 -0
- modules/tm_metadata_manager.py +545 -0
- modules/tmx_editor.py +1461 -0
- modules/tmx_editor_qt.py +2784 -0
- modules/tmx_generator.py +284 -0
- modules/tracked_changes.py +900 -0
- modules/trados_docx_handler.py +430 -0
- modules/translation_memory.py +715 -0
- modules/translation_results_panel.py +2134 -0
- modules/translation_services.py +282 -0
- modules/unified_prompt_library.py +659 -0
- modules/unified_prompt_manager_qt.py +3951 -0
- modules/voice_commands.py +920 -0
- modules/voice_dictation.py +477 -0
- modules/voice_dictation_lite.py +249 -0
- supervertaler-1.9.153.dist-info/METADATA +896 -0
- supervertaler-1.9.153.dist-info/RECORD +85 -0
- supervertaler-1.9.153.dist-info/WHEEL +5 -0
- supervertaler-1.9.153.dist-info/entry_points.txt +2 -0
- supervertaler-1.9.153.dist-info/licenses/LICENSE +21 -0
- supervertaler-1.9.153.dist-info/top_level.txt +2 -0
modules/pdf_rescue_Qt.py
ADDED
|
@@ -0,0 +1,1822 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF Rescue Module - Qt Edition
|
|
3
|
+
Embeddable version of the AI-powered OCR tool for extracting text from poorly formatted PDFs
|
|
4
|
+
Supports multiple AI providers: OpenAI GPT-4 Vision, Anthropic Claude Vision, Google Gemini Vision
|
|
5
|
+
|
|
6
|
+
This module can be embedded in the main Supervertaler Qt application as a tab.
|
|
7
|
+
Can also be run independently as a standalone application.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import base64
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from PyQt6.QtWidgets import (
|
|
14
|
+
QWidget, QVBoxLayout, QHBoxLayout, QLabel, QPushButton, QListWidget, QTextEdit,
|
|
15
|
+
QComboBox, QCheckBox, QProgressBar, QFileDialog, QMessageBox, QSplitter,
|
|
16
|
+
QGroupBox, QFrame, QDialog, QDialogButtonBox, QPlainTextEdit, QApplication,
|
|
17
|
+
QStyleOptionButton
|
|
18
|
+
)
|
|
19
|
+
from PyQt6.QtCore import Qt, QTimer, QPointF, QRect
|
|
20
|
+
from PyQt6.QtGui import QFont, QTextOption, QPainter, QPen, QColor, QStandardItemModel, QStandardItem
|
|
21
|
+
from docx import Document
|
|
22
|
+
from docx.shared import Pt
|
|
23
|
+
import fitz # PyMuPDF
|
|
24
|
+
import re
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class CheckmarkCheckBox(QCheckBox):
    """Checkbox that turns green and shows a white tick while checked (Supervertaler Qt look)."""

    def __init__(self, text="", parent=None):
        super().__init__(text, parent)
        # The stylesheet only colours the indicator; the tick itself is painted in paintEvent.
        self.setStyleSheet("""
            QCheckBox {
                font-size: 9pt;
                spacing: 6px;
            }
            QCheckBox::indicator {
                width: 18px;
                height: 18px;
                border: 2px solid #999;
                border-radius: 3px;
                background-color: white;
            }
            QCheckBox::indicator:checked {
                background-color: #4CAF50;
                border-color: #4CAF50;
            }
            QCheckBox::indicator:hover {
                border-color: #666;
            }
            QCheckBox::indicator:checked:hover {
                background-color: #45a049;
                border-color: #45a049;
            }
        """)

    def paintEvent(self, a0):
        """Run the default painting, then overlay a white tick when the box is checked."""
        super().paintEvent(a0)

        if not self.isChecked():
            return

        # Ask the active style where the indicator square is located.
        style_option = QStyleOptionButton()
        self.initStyleOption(style_option)
        box = self.style().subElementRect(
            self.style().SubElement.SE_CheckBoxIndicator,
            style_option,
            self
        )

        if box.isValid():
            self._draw_checkmark(box)

    def _draw_checkmark(self, box):
        """Paint the two strokes of the tick inside the indicator rectangle ``box``."""
        short_side = min(box.width(), box.height())

        brush_painter = QPainter(self)
        brush_painter.setRenderHint(QPainter.RenderHint.Antialiasing)
        # Stroke scales with the indicator but never drops below 2 px.
        stroke = max(2.0, short_side * 0.12)
        white_pen = QPen(
            QColor(255, 255, 255),
            stroke,
            Qt.PenStyle.SolidLine,
            Qt.PenCapStyle.RoundCap,
            Qt.PenJoinStyle.RoundJoin,
        )
        brush_painter.setPen(white_pen)
        brush_painter.setBrush(QColor(255, 255, 255))

        # Shrink the drawing area by 15% on every side so the tick is not clipped.
        inset = short_side * 0.15
        left = box.x() + inset
        top = box.y() + inset
        inner_w = box.width() - inset * 2
        inner_h = box.height() - inset * 2

        # Three anchor points of the tick, as proportions of the inset area.
        start = QPointF(left + inner_w * 0.10, top + inner_h * 0.50)
        elbow = QPointF(left + inner_w * 0.35, top + inner_h * 0.70)
        tip = QPointF(left + inner_w * 0.90, top + inner_h * 0.25)

        # Long stroke first (elbow -> tip), then the short one (start -> elbow).
        brush_painter.drawLine(elbow, tip)
        brush_painter.drawLine(start, elbow)

        brush_painter.end()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class PDFRescueQt:
|
|
111
|
+
"""
|
|
112
|
+
PDF Rescue feature - extract text from images using AI OCR
|
|
113
|
+
Can be embedded in any PyQt6 application as a tab or panel
|
|
114
|
+
"""
|
|
115
|
+
|
|
116
|
+
def __init__(self, parent_app, standalone=False):
|
|
117
|
+
"""
|
|
118
|
+
Initialize PDF Rescue module
|
|
119
|
+
|
|
120
|
+
Args:
|
|
121
|
+
parent_app: Reference to the main application (needs .load_api_keys() method or .api_keys attribute)
|
|
122
|
+
standalone: If True, running as standalone app. If False, embedded in Supervertaler
|
|
123
|
+
"""
|
|
124
|
+
self.parent_app = parent_app
|
|
125
|
+
self.standalone = standalone
|
|
126
|
+
self.clients = {} # Dictionary to store clients for different providers
|
|
127
|
+
self.image_files = []
|
|
128
|
+
self.extracted_texts = {}
|
|
129
|
+
|
|
130
|
+
# Load API keys for all providers
|
|
131
|
+
self.api_keys = {}
|
|
132
|
+
if hasattr(parent_app, 'load_api_keys'):
|
|
133
|
+
# Supervertaler_Qt style
|
|
134
|
+
self.api_keys = parent_app.load_api_keys()
|
|
135
|
+
elif hasattr(parent_app, 'api_keys'):
|
|
136
|
+
# Direct api_keys dict
|
|
137
|
+
self.api_keys = parent_app.api_keys
|
|
138
|
+
|
|
139
|
+
# Initialize clients for available providers
|
|
140
|
+
self._initialize_clients()
|
|
141
|
+
|
|
142
|
+
def _initialize_clients(self):
|
|
143
|
+
"""Initialize API clients for all available providers"""
|
|
144
|
+
# OpenAI
|
|
145
|
+
openai_key = self.api_keys.get('openai') or self.api_keys.get('openai_api_key')
|
|
146
|
+
if openai_key:
|
|
147
|
+
try:
|
|
148
|
+
from openai import OpenAI
|
|
149
|
+
self.clients['openai'] = OpenAI(api_key=openai_key)
|
|
150
|
+
self.log_message("✓ OpenAI client initialized")
|
|
151
|
+
except Exception as e:
|
|
152
|
+
self.log_message(f"⚠ Failed to initialize OpenAI: {e}")
|
|
153
|
+
|
|
154
|
+
# Anthropic Claude
|
|
155
|
+
claude_key = self.api_keys.get('claude') or self.api_keys.get('anthropic')
|
|
156
|
+
if claude_key:
|
|
157
|
+
try:
|
|
158
|
+
import anthropic
|
|
159
|
+
self.clients['claude'] = anthropic.Anthropic(api_key=claude_key)
|
|
160
|
+
self.log_message("✓ Claude client initialized")
|
|
161
|
+
except ImportError:
|
|
162
|
+
self.log_message("⚠ Claude requested but 'anthropic' library not installed. Run: pip install anthropic")
|
|
163
|
+
except Exception as e:
|
|
164
|
+
self.log_message(f"⚠ Failed to initialize Claude: {e}")
|
|
165
|
+
|
|
166
|
+
# Google Gemini
|
|
167
|
+
gemini_key = self.api_keys.get('gemini') or self.api_keys.get('google')
|
|
168
|
+
if gemini_key:
|
|
169
|
+
try:
|
|
170
|
+
import google.generativeai as genai
|
|
171
|
+
genai.configure(api_key=gemini_key)
|
|
172
|
+
self.clients['gemini'] = genai
|
|
173
|
+
self.log_message("✓ Gemini client initialized")
|
|
174
|
+
except ImportError:
|
|
175
|
+
self.log_message("⚠ Gemini requested but 'google-generativeai' library not installed. Run: pip install google-generativeai")
|
|
176
|
+
except Exception as e:
|
|
177
|
+
self.log_message(f"⚠ Failed to initialize Gemini: {e}")
|
|
178
|
+
|
|
179
|
+
def log_message(self, message: str):
|
|
180
|
+
"""Log a message to the parent app's log if available"""
|
|
181
|
+
if hasattr(self.parent_app, 'log'):
|
|
182
|
+
self.parent_app.log(f"[PDF Rescue] {message}")
|
|
183
|
+
else:
|
|
184
|
+
print(f"[PDF Rescue] {message}")
|
|
185
|
+
|
|
186
|
+
def create_tab(self, parent):
    """
    Create the PDF Rescue tab UI

    Builds the whole panel inside ``parent``: header and description, a
    horizontal splitter (file list, import buttons and processing options on
    the left; extraction instructions and text preview on the right), the
    action button row, a status label and a progress bar. Widgets that other
    methods need later are stored on ``self`` (file_listbox, model_combo,
    preserve_formatting_check, model_descriptions_text, instructions_text,
    preview_text, status_label, progress).

    The current file list and extracted texts are kept across a rebuild.

    Args:
        parent: The parent widget (QWidget)
    """
    # Save current state before recreating UI
    saved_files = self.image_files.copy() if hasattr(self, 'image_files') else []
    # extracted_texts is a dict (path -> text) everywhere else, so the fallback must be a dict too
    saved_texts = self.extracted_texts.copy() if hasattr(self, 'extracted_texts') else {}

    # Main layout
    main_layout = QVBoxLayout(parent)
    main_layout.setContentsMargins(10, 10, 10, 10)
    main_layout.setSpacing(5)  # Reduced from 10 to 5 for tighter spacing

    # Header (matches Universal Lookup / AutoFingers style)
    header = QLabel("🔍 PDF Rescue - AI-Powered OCR")
    header.setStyleSheet("font-size: 16pt; font-weight: bold; color: #1976D2;")
    main_layout.addWidget(header, 0)  # 0 = no stretch, stays compact

    # Description box (matches Universal Lookup / AutoFingers style)
    description = QLabel(
        "Extract text from image-based PDFs using AI vision OCR. Designed for scanned documents, screenshots, "
        "and PDFs without accessible text. Not recommended for PDFs with selectable text - use professional tools like Adobe Acrobat instead."
    )
    description.setWordWrap(True)
    description.setStyleSheet("color: #666; padding: 5px; background-color: #E3F2FD; border-radius: 3px;")
    main_layout.addWidget(description, 0)  # 0 = no stretch, stays compact

    # Split view: Files on left, Preview on right
    splitter = QSplitter(Qt.Orientation.Horizontal)

    # LEFT: File list
    left_widget = QWidget()
    left_layout = QVBoxLayout(left_widget)
    left_layout.setContentsMargins(3, 0, 3, 3)  # Zero top margin to eliminate gap
    left_layout.setSpacing(3)

    files_label = QLabel("Images to Process")
    files_label.setFont(QFont("Segoe UI", 9, QFont.Weight.Bold))
    left_layout.addWidget(files_label)

    # File list
    self.file_listbox = QListWidget()
    self.file_listbox.setFont(QFont("Consolas", 9))
    self.file_listbox.itemSelectionChanged.connect(self._on_file_select)
    left_layout.addWidget(self.file_listbox)

    # Buttons
    btn_layout = QHBoxLayout()

    pdf_btn = QPushButton("📄 Import PDF")
    pdf_btn.clicked.connect(self._import_from_pdf)
    pdf_btn.setStyleSheet("background-color: #9C27B0; color: white; font-weight: bold; padding: 4px 8px;")
    pdf_btn.setToolTip("Extract all pages from a PDF file and convert them to images for OCR processing")
    btn_layout.addWidget(pdf_btn)

    add_files_btn = QPushButton("➕ Add Image Files")
    add_files_btn.clicked.connect(self._add_files)
    add_files_btn.setStyleSheet("background-color: #2196F3; color: white; font-weight: bold; padding: 4px 8px;")
    add_files_btn.setToolTip("Supported formats: .jpg, .jpeg, .png, .bmp, .gif, .tiff")
    btn_layout.addWidget(add_files_btn)

    add_folder_btn = QPushButton("📂 Folder")
    add_folder_btn.clicked.connect(self._add_folder)
    add_folder_btn.setStyleSheet("background-color: #2196F3; color: white; font-weight: bold; padding: 4px 8px;")
    add_folder_btn.setToolTip("Add all image files from a selected folder")
    btn_layout.addWidget(add_folder_btn)

    clear_btn = QPushButton("Clear")
    clear_btn.clicked.connect(self._clear_list)
    clear_btn.setStyleSheet("background-color: #9E9E9E; color: white; padding: 4px 8px;")
    clear_btn.setToolTip("Remove all files from the list")
    btn_layout.addWidget(clear_btn)

    left_layout.addLayout(btn_layout)

    # Processing options (moved into left panel to eliminate wasted space)
    options_group = QGroupBox("Processing Options")
    options_layout = QVBoxLayout(options_group)
    options_layout.setContentsMargins(8, 8, 8, 3)  # Minimal bottom margin
    options_layout.setSpacing(3)  # Reduced spacing

    # === MODEL SELECTOR ===
    # Model selection and formatting option
    model_layout = QHBoxLayout()

    model_label = QLabel("AI Model:")
    model_label.setFont(QFont("Segoe UI", 9))
    model_layout.addWidget(model_label)

    self.model_combo = QComboBox()
    # Organize models by provider with separators
    self.model_combo.addItem("--- OpenAI ---")
    self.model_combo.addItems(["gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4", "gpt-5"])
    self.model_combo.addItem("--- Claude (Anthropic) ---")
    self.model_combo.addItems(["claude-3-5-sonnet-20241022", "claude-3-5-haiku-20241022", "claude-3-opus-20240229"])
    self.model_combo.addItem("--- Gemini (Google) ---")
    self.model_combo.addItems(["gemini-2.0-flash-exp", "gemini-1.5-pro-002", "gemini-1.5-flash-002"])

    # Set default model
    self.model_combo.setCurrentText("gpt-4o")

    # Make separator items non-selectable by disabling them
    combo_model = self.model_combo.model()
    if isinstance(combo_model, QStandardItemModel):
        for i in range(self.model_combo.count()):
            if self.model_combo.itemText(i).startswith("---"):
                item = combo_model.item(i)
                if item:
                    item.setEnabled(False)
                    # Clear every item flag so the separator cannot be selected or activated
                    item.setFlags(Qt.ItemFlag.NoItemFlags)

    self.model_combo.setEditable(False)
    self.model_combo.setToolTip("Select AI model for vision OCR processing (OpenAI, Claude, or Gemini)")
    model_layout.addWidget(self.model_combo)

    model_layout.addSpacing(20)

    self.preserve_formatting_check = CheckmarkCheckBox("Preserve formatting (bold/italic/underline)")
    self.preserve_formatting_check.setChecked(True)
    self.preserve_formatting_check.setFont(QFont("Segoe UI", 9))
    self.preserve_formatting_check.setToolTip("When enabled, the AI will use markdown to preserve text formatting (bold, italic, underline)")
    model_layout.addWidget(self.preserve_formatting_check)

    model_layout.addStretch()
    options_layout.addLayout(model_layout)

    # Model descriptions (prominent display in left panel)
    model_desc_label = QLabel("Model Capabilities:")
    model_desc_label.setFont(QFont("Segoe UI", 9, QFont.Weight.Bold))
    options_layout.addWidget(model_desc_label)

    self.model_descriptions_text = QPlainTextEdit()
    self.model_descriptions_text.setFont(QFont("Segoe UI", 9))
    self.model_descriptions_text.setReadOnly(True)
    # Size naturally to content - no fixed max height to eliminate wasted space below
    model_descriptions = """• gpt-4o (Recommended): Fast and accurate; best balance of speed, quality, and cost; excellent for most documents including tables
• gpt-4o-mini: Fast and economical; good for simple documents; may struggle with complex layouts or tables
• gpt-4-turbo: Large context window (128k tokens); good for very long documents; slightly slower but handles extensive content well
• gpt-4: Classic, reliable baseline; consistent quality; good for standard documents, though slower than gpt-4o
• gpt-5 (Advanced Reasoning): Reasoning model; may improve table extraction and complex layouts; slower and more expensive; best for: complex tables, technical documents, structured data extraction"""
    self.model_descriptions_text.setPlainText(model_descriptions)
    options_layout.addWidget(self.model_descriptions_text)

    left_layout.addWidget(options_group)

    # Add stretch to push everything up and eliminate wasted space below
    left_layout.addStretch()

    splitter.addWidget(left_widget)

    # RIGHT: Extraction Instructions and Text preview with vertical splitter
    right_splitter = QSplitter(Qt.Orientation.Vertical)

    # Top: Extraction Instructions
    instructions_widget = QWidget()
    instructions_widget_layout = QVBoxLayout(instructions_widget)
    instructions_widget_layout.setContentsMargins(0, 0, 0, 0)
    instructions_widget_layout.setSpacing(0)

    instructions_group = QGroupBox("Extraction Instructions")
    instructions_group_layout = QVBoxLayout(instructions_group)
    instructions_group_layout.setContentsMargins(8, 8, 8, 8)

    instructions_header = QHBoxLayout()
    instructions_header.setContentsMargins(0, 0, 0, 5)
    instructions_header.addStretch()

    show_prompt_btn = QPushButton("👁️ Show Prompt")
    show_prompt_btn.clicked.connect(self._show_full_prompt)
    show_prompt_btn.setStyleSheet("background-color: #9C27B0; color: white; padding: 2px 8px;")
    show_prompt_btn.setToolTip("Preview the exact prompt that will be sent to the AI model")
    instructions_header.addWidget(show_prompt_btn)

    instructions_group_layout.addLayout(instructions_header)

    self.instructions_text = QPlainTextEdit()
    self.instructions_text.setFont(QFont("Segoe UI", 9))
    self.instructions_text.setMinimumHeight(60)  # Reduced minimum height
    default_instructions = """Extract all text from this image. The image is a screenshot from a poorly formatted PDF.
Please:
- Extract all visible text accurately
- Fix any obvious OCR errors or formatting issues
- Remove extraneous line breaks within paragraphs
- Preserve intentional paragraph breaks
- Maintain the logical flow and structure of the content
- For redacted/blacked-out text: insert a descriptive placeholder in square brackets in the document's language (e.g., [naam] for Dutch names, [name] for English names, [bedrag] for amounts, etc.)
- For stamps, signatures, or images: insert a descriptive placeholder in square brackets in the document's language (e.g., [handtekening], [stempel], [signature], [stamp], etc.)
- For any non-text elements that would normally appear: describe them briefly in square brackets
- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__
- Output clean, readable text only (no commentary)"""
    self.instructions_text.setPlainText(default_instructions)
    instructions_group_layout.addWidget(self.instructions_text)

    instructions_widget_layout.addWidget(instructions_group)
    right_splitter.addWidget(instructions_widget)

    # Bottom: Extracted Text Preview
    preview_widget = QWidget()
    preview_widget_layout = QVBoxLayout(preview_widget)
    preview_widget_layout.setContentsMargins(0, 0, 0, 0)
    preview_widget_layout.setSpacing(0)

    preview_label = QLabel("Extracted Text Preview")
    preview_label.setFont(QFont("Segoe UI", 9, QFont.Weight.Bold))
    preview_label.setContentsMargins(0, 5, 0, 5)
    preview_widget_layout.addWidget(preview_label)

    self.preview_text = QTextEdit()
    self.preview_text.setFont(QFont("Segoe UI", 9))
    self.preview_text.setReadOnly(False)  # Allow editing
    self.preview_text.setWordWrapMode(QTextOption.WrapMode.WordWrap)
    preview_widget_layout.addWidget(self.preview_text)

    right_splitter.addWidget(preview_widget)

    # Set initial splitter sizes for right panel (30% instructions, 70% preview)
    right_splitter.setSizes([150, 350])

    splitter.addWidget(right_splitter)

    # Set splitter sizes (1:2 ratio)
    splitter.setSizes([300, 600])

    main_layout.addWidget(splitter, 1)  # 1 = stretch factor, expands to fill space

    # Action buttons
    action_layout = QHBoxLayout()
    action_layout.setSpacing(5)

    process_selected_btn = QPushButton("🔍 Process Selected")
    process_selected_btn.clicked.connect(self._process_selected)
    process_selected_btn.setStyleSheet("background-color: #FF9800; color: white; font-weight: bold; padding: 6px 15px;")
    process_selected_btn.setToolTip("Process the currently selected image with AI OCR to extract text")
    action_layout.addWidget(process_selected_btn)

    process_all_btn = QPushButton("⚡ Process ALL")
    process_all_btn.clicked.connect(self._process_all)
    process_all_btn.setStyleSheet("background-color: #4CAF50; color: white; font-weight: bold; padding: 6px 15px;")
    process_all_btn.setToolTip("Process all images in the list with AI OCR. This will use API credits and may take several minutes.")
    action_layout.addWidget(process_all_btn)

    # "&&" renders as a literal ampersand on a Qt button label
    save_docx_btn = QPushButton("💾 Export Markdown && Word")
    save_docx_btn.clicked.connect(self._export_markdown_and_word)
    save_docx_btn.setStyleSheet("background-color: #2196F3; color: white; font-weight: bold; padding: 6px 15px;")
    save_docx_btn.setToolTip("Export extracted text as Markdown (.md), Word document (.docx), and session report")
    action_layout.addWidget(save_docx_btn)

    copy_all_btn = QPushButton("📋 Copy All")
    copy_all_btn.clicked.connect(self._copy_all_text)
    copy_all_btn.setStyleSheet("background-color: #607D8B; color: white; font-weight: bold; padding: 6px 15px;")
    copy_all_btn.setToolTip("Copy all extracted text from all processed images to clipboard")
    action_layout.addWidget(copy_all_btn)

    session_report_btn = QPushButton("📊 Session Report")
    session_report_btn.clicked.connect(self._save_session_report)
    session_report_btn.setStyleSheet("background-color: #795548; color: white; font-weight: bold; padding: 6px 15px;")
    session_report_btn.setToolTip("Generate and save a detailed session report in Markdown format with statistics and extracted text")
    action_layout.addWidget(session_report_btn)

    action_layout.addStretch()
    main_layout.addLayout(action_layout, 0)  # 0 = no stretch, stays compact

    # Status
    self.status_label = QLabel("Ready - Add images to begin")
    self.status_label.setFont(QFont("Segoe UI", 9))
    self.status_label.setStyleSheet("color: #666;")
    main_layout.addWidget(self.status_label, 0)  # 0 = no stretch, stays compact

    # Progress bar
    self.progress = QProgressBar()
    self.progress.setMinimum(0)
    self.progress.setMaximum(100)
    self.progress.setValue(0)
    main_layout.addWidget(self.progress, 0)  # 0 = no stretch, stays compact

    # Restore state after UI creation
    self.image_files = saved_files
    self.extracted_texts = saved_texts
    if self.image_files:
        self._update_listbox()
|
|
470
|
+
|
|
471
|
+
# === File Management Methods ===
|
|
472
|
+
|
|
473
|
+
def _import_from_pdf(self):
    """Import images directly from a PDF file (simple OCR-only version)

    Asks the user for a PDF, renders every page to a PNG at 2x zoom into a
    "<pdf name>_images" folder next to the PDF, and appends the new image
    paths to ``self.image_files``. Progress is shown through the status
    label and progress bar; any failure is reported in a message box and
    logged instead of being raised.
    """
    pdf_file, _ = QFileDialog.getOpenFileName(
        parent=None,
        caption="Select PDF File",
        filter="PDF files (*.pdf);;All files (*.*)"
    )

    if not pdf_file:
        return

    try:
        # Open PDF
        doc = fitz.open(pdf_file)
        try:
            total_pages = len(doc)

            if total_pages == 0:
                QMessageBox.warning(None, "Empty PDF", "The selected PDF has no pages.")
                return

            # Create folder for extracted images next to the PDF
            pdf_path = Path(pdf_file)
            pdf_name = pdf_path.stem
            images_folder = pdf_path.parent / f"{pdf_name}_images"

            # Create folder if it doesn't exist
            images_folder.mkdir(exist_ok=True)
            temp_dir = str(images_folder)

            # Log start
            self.log_message(f"Starting PDF import: {pdf_path.name}")
            self.log_message(f"Total pages: {total_pages}")

            # Extract each page as an image
            extracted_count = 0
            self.status_label.setText("Extracting pages from PDF...")
            # The range is the same for every page, so set it once up front
            self.progress.setMaximum(total_pages)
            QApplication.processEvents()

            # Render at 2x resolution for better OCR quality
            zoom = 2.0
            mat = fitz.Matrix(zoom, zoom)

            for page_num in range(total_pages):
                page = doc[page_num]
                pix = page.get_pixmap(matrix=mat)

                # Save as PNG
                img_filename = f"{pdf_name}_page_{page_num + 1:03d}.png"
                img_path = os.path.join(temp_dir, img_filename)
                pix.save(img_path)

                # Add to image list (re-importing the same PDF must not create duplicates)
                if img_path not in self.image_files:
                    self.image_files.append(img_path)
                    extracted_count += 1

                # Log each page
                self.log_message(f"  Page {page_num + 1}/{total_pages} extracted: {img_filename}")

                # Update progress
                self.status_label.setText(
                    f"Extracting page {page_num + 1}/{total_pages}..."
                )
                self.progress.setValue(page_num + 1)
                # Keep the UI responsive while rendering on the GUI thread
                QApplication.processEvents()
        finally:
            # Release the PDF on every path: success, empty document, or a failing page
            doc.close()

        # Update list
        self._update_listbox()

        # Log completion
        self.log_message(f"PDF import complete: {extracted_count} pages extracted to {images_folder}")
        self.status_label.setText(f"✓ Imported {extracted_count} pages from PDF")

        QMessageBox.information(
            None,
            "PDF Import Complete",
            f"Successfully extracted {extracted_count} pages from PDF!\n\n"
            f"Images saved to:\n{images_folder}\n\n"
            f"You can now process them with AI OCR."
        )

    except Exception as e:
        QMessageBox.critical(None, "PDF Import Error", f"Failed to import PDF:\n\n{str(e)}")
        self.log_message(f"ERROR importing PDF: {str(e)}")
        self.status_label.setText("PDF import failed")
|
|
561
|
+
|
|
562
|
+
def _add_files(self):
    """Add individual image files

    Opens a multi-select file dialog and appends every chosen path that is
    not already in ``self.image_files``. The status label and log report
    the number of files actually added (duplicates are not counted).
    """
    files, _ = QFileDialog.getOpenFileNames(
        parent=None,
        caption="Select Image Files",
        filter="Image files (*.jpg *.jpeg *.png *.bmp *.gif *.tiff);;All files (*.*)"
    )

    if files:
        added_count = 0
        for file in files:
            if file not in self.image_files:
                self.image_files.append(file)
                added_count += 1
        self._update_listbox()
        # Report what was really added, consistent with _add_folder
        self.status_label.setText(f"Added {added_count} file(s)")
        self.log_message(f"Added {added_count} image file(s)")
|
|
577
|
+
|
|
578
|
+
def _add_folder(self):
    """Queue every not-yet-listed image found directly inside a chosen folder.

    Shows a directory picker and returns silently when it is dismissed.
    Entries are visited in sorted name order; only regular files whose
    lower-cased extension is in the accepted set and whose full path is not
    already in ``self.image_files`` are added. Sub-folders are not scanned.
    """
    folder = QFileDialog.getExistingDirectory(
        parent=None,
        caption="Select Folder with Images"
    )
    if not folder:
        return

    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}
    new_paths = []

    for entry in sorted(os.listdir(folder)):
        candidate = os.path.join(folder, entry)
        if not os.path.isfile(candidate):
            continue
        suffix = os.path.splitext(entry)[1].lower()
        if suffix not in image_extensions or candidate in self.image_files:
            continue
        new_paths.append(candidate)

    self.image_files.extend(new_paths)
    self._update_listbox()
    self.status_label.setText(f"Added {len(new_paths)} file(s) from folder")
    self.log_message(f"Added {len(new_paths)} file(s) from folder: {folder}")
|
|
600
|
+
|
|
601
|
+
def _clear_list(self):
    """Empty the image queue and its extracted texts after confirmation.

    Does nothing when the queue is already empty or when the user answers
    anything other than "Yes" in the confirmation box.
    """
    if not self.image_files:
        return

    yes = QMessageBox.StandardButton.Yes
    answer = QMessageBox.question(
        None,
        "Clear",
        "Remove all files?",
        yes | QMessageBox.StandardButton.No
    )
    if answer != yes:
        return

    self.image_files = []
    self.extracted_texts = {}
    self._update_listbox()
    self.preview_text.clear()
    self.status_label.setText("List cleared")
|
|
616
|
+
|
|
617
|
+
def _update_listbox(self):
    """Rebuild the file list widget from ``self.image_files``.

    Each entry shows a 1-based, width-2 position and the file's base name;
    files that already have an entry in ``self.extracted_texts`` are
    prefixed with a check mark.
    """
    listbox = self.file_listbox
    listbox.clear()

    position = 0
    for path in self.image_files:
        position += 1
        if path in self.extracted_texts:
            prefix = "✓ "
        else:
            prefix = ""
        name = os.path.basename(path)
        listbox.addItem(f"{prefix}{position:2d}. {name}")
|
|
624
|
+
|
|
625
|
+
def _on_file_select(self):
    """Display the stored text of the first selected list entry, if any.

    The preview is left untouched when nothing is selected, when the
    selected row is past the end of ``self.image_files``, or when the file
    has no entry in ``self.extracted_texts`` yet.
    """
    selection = self.file_listbox.selectedItems()
    if not selection:
        return

    row = self.file_listbox.row(selection[0])
    if not row < len(self.image_files):
        return

    path = self.image_files[row]
    if path not in self.extracted_texts:
        return

    self.preview_text.setPlainText(self.extracted_texts[path])
|
|
636
|
+
|
|
637
|
+
def _show_full_prompt(self):
    """Open a modal dialog previewing the instructions text for the AI call.

    The instructions are taken from the editor and adjusted the same way
    as in ``_extract_text_from_image``: the markdown-formatting rule is
    appended when formatting preservation is checked (and not already
    mentioned), or stripped when it is unchecked. The dialog also shows the
    selected model, the formatting switch state and the token limit label.
    """
    markdown_rule = "- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__"
    keep_formatting = self.preserve_formatting_check.isChecked()

    instructions = self.instructions_text.toPlainText().strip()
    if not keep_formatting:
        # Remove the rule both with and without its leading newline.
        instructions = instructions.replace("\n" + markdown_rule, "").replace(markdown_rule, "")
    elif "markdown for text formatting" not in instructions:
        instructions += "\n" + markdown_rule

    def add_label(target, text, size, bold=False):
        """Create a Segoe UI label of the given size and append it to *target*."""
        label = QLabel(text)
        if bold:
            label.setFont(QFont("Segoe UI", size, QFont.Weight.Bold))
        else:
            label.setFont(QFont("Segoe UI", size))
        target.addWidget(label)
        return label

    # Create popup dialog
    popup = QDialog()
    popup.setWindowTitle("Full Prompt Preview")
    popup.resize(700, 600)
    layout = QVBoxLayout(popup)

    # Title
    add_label(layout, "Exact Prompt Sent to OpenAI API", 12, bold=True)

    # Configuration summary
    info_group = QGroupBox("Configuration")
    info_layout = QVBoxLayout(info_group)
    add_label(info_layout, f"Model: {self.model_combo.currentText()}", 9, bold=True)
    if keep_formatting:
        formatting_status = "✓ Enabled"
    else:
        formatting_status = "✗ Disabled"
    add_label(info_layout, f"Formatting Preservation: {formatting_status}", 9)
    add_label(info_layout, "Max Tokens: 4000", 9)
    layout.addWidget(info_group)

    # Read-only view of the prompt text
    prompt_group = QGroupBox("Full Instructions Text")
    prompt_layout = QVBoxLayout(prompt_group)
    prompt_view = QPlainTextEdit()
    prompt_view.setFont(QFont("Consolas", 9))
    prompt_view.setPlainText(instructions)
    prompt_view.setReadOnly(True)
    prompt_layout.addWidget(prompt_view)
    layout.addWidget(prompt_group)

    # Note
    note = QLabel(
        "Note: The image is sent as base64-encoded data along with these instructions."
    )
    note.setStyleSheet("color: #666;")
    note.setFont(QFont("Segoe UI", 8))
    layout.addWidget(note)

    # Close button
    button_box = QDialogButtonBox(QDialogButtonBox.StandardButton.Close)
    button_box.rejected.connect(popup.close)
    layout.addWidget(button_box)

    popup.exec()
|
|
709
|
+
|
|
710
|
+
def _save_session_report(self):
    """Write a markdown report of the current OCR session and offer to open it.

    The report lists the configuration, the instructions text (adjusted for
    the formatting checkbox the same way the extraction prompt is), a
    per-image status table, every extracted text and some size statistics.

    Saving and opening are handled separately: a failure to launch a viewer
    is reported as a warning and is never presented as a failed save.
    Opening uses ``os.startfile`` where it exists and falls back to the
    ``open`` / ``xdg-open`` commands elsewhere.
    """
    import subprocess
    import sys
    from datetime import datetime

    if not self.extracted_texts:
        QMessageBox.warning(
            None,
            "No Data",
            "No OCR processing has been performed yet.\n\n"
            "Process some images first to generate a session report."
        )
        return

    # Ask for save location
    output_file, _ = QFileDialog.getSaveFileName(
        parent=None,
        caption="Save Session Report",
        filter="Markdown files (*.md);;Text files (*.txt);;All files (*.*)",
        initialFilter="Markdown files (*.md)"
    )
    if not output_file:
        return

    report_name = os.path.basename(output_file)
    markdown_rule = "- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__"

    try:
        report_lines = []
        report_lines.append("# PDF Rescue - Session Report\n")
        report_lines.append("**Generated by [Supervertaler](https://supervertaler.com/) • by Michael Beijer**\n\n")
        report_lines.append(f"**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        report_lines.append("---\n\n")

        # Configuration section
        keep_formatting = self.preserve_formatting_check.isChecked()
        report_lines.append("## Configuration\n\n")
        report_lines.append(f"- **Model**: {self.model_combo.currentText()}\n")
        formatting_status = "Enabled ✓" if keep_formatting else "Disabled ✗"
        report_lines.append(f"- **Formatting Preservation**: {formatting_status}\n")
        report_lines.append(f"- **Total Images Processed**: {len(self.extracted_texts)}\n")
        report_lines.append(f"- **Total Images in List**: {len(self.image_files)}\n\n")

        # Instructions used: mirror the add/remove logic applied to the
        # prompt at extraction time so the report matches what is sent.
        instructions = self.instructions_text.toPlainText().strip()
        if keep_formatting:
            if "markdown for text formatting" not in instructions:
                instructions += "\n" + markdown_rule
        else:
            instructions = instructions.replace("\n" + markdown_rule, "").replace(markdown_rule, "")
        report_lines.append("## Extraction Instructions\n\n")
        report_lines.append("```\n")
        report_lines.append(instructions)
        report_lines.append("\n```\n\n")

        # Processing summary
        report_lines.append("## Processing Summary\n\n")
        report_lines.append("| # | Image File | Status |\n")
        report_lines.append("|---|------------|--------|\n")
        for i, file in enumerate(self.image_files, 1):
            filename = os.path.basename(file)
            status = "✓ Processed" if file in self.extracted_texts else "⧗ Pending"
            report_lines.append(f"| {i} | {filename} | {status} |\n")
        report_lines.append("\n---\n\n")

        # Extracted text for each image
        report_lines.append("## Extracted Text\n\n")
        for i, file in enumerate(self.image_files, 1):
            if file in self.extracted_texts:
                filename = os.path.basename(file)
                report_lines.append(f"### Page {i}: {filename}\n\n")
                report_lines.append("```\n")
                report_lines.append(self.extracted_texts[file])
                report_lines.append("\n```\n\n")
                report_lines.append("---\n\n")

        # Statistics (extracted_texts is non-empty here, see the guard above)
        report_lines.append("## Statistics\n\n")
        texts_list = list(self.extracted_texts.values())
        page_count = len(texts_list)
        total_chars = sum(len(text) for text in texts_list)
        total_words = sum(len(text.split()) for text in texts_list)
        report_lines.append(f"- **Total Characters Extracted**: {total_chars:,}\n")
        report_lines.append(f"- **Total Words Extracted**: {total_words:,}\n")
        report_lines.append(f"- **Average Characters per Page**: {total_chars // page_count:,}\n")
        report_lines.append(f"- **Average Words per Page**: {total_words // page_count:,}\n\n")

        # Footer
        report_lines.append("---\n\n")
        report_lines.append("*Report generated by **PDF Rescue** - AI-Powered OCR Tool*\n\n")
        report_lines.append("*Part of [**Supervertaler**](https://supervertaler.com/) • by Michael Beijer*\n")

        # Write to file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(report_lines)
    except Exception as e:
        QMessageBox.critical(None, "Error", f"Failed to save report:\n\n{str(e)}")
        return

    self.log_message(f"Session report saved: {report_name}")
    self.status_label.setText(f"✓ Report saved to {report_name}")

    reply = QMessageBox.question(
        None,
        "Success",
        f"Session report saved successfully!\n\n"
        f"File: {report_name}\n\n"
        "Open the report now?",
        QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No
    )
    if reply != QMessageBox.StandardButton.Yes:
        return

    # os.startfile only exists on Windows; use the platform launcher elsewhere.
    try:
        if hasattr(os, "startfile"):
            os.startfile(output_file)
        elif sys.platform == "darwin":
            subprocess.run(["open", output_file], check=False)
        else:
            subprocess.run(["xdg-open", output_file], check=False)
    except OSError as e:
        QMessageBox.warning(
            None,
            "Cannot Open Report",
            f"The report was saved, but it could not be opened automatically:\n\n{e}"
        )
|
|
819
|
+
|
|
820
|
+
# === OCR Processing Methods ===
|
|
821
|
+
|
|
822
|
+
def _encode_image(self, image_path):
    """Return the file's raw bytes as a base64 string (UTF-8 decoded)."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
|
|
826
|
+
|
|
827
|
+
def _extract_text_from_image(self, image_path):
    """Run OCR on one image with the provider of the selected model.

    Builds the prompt from the instructions editor (adding or stripping the
    markdown-formatting rule according to the checkbox), resolves the
    provider from the model name and delegates to the matching
    ``_extract_with_*`` method.

    Returns the extracted text, or a string starting with ``[ERROR`` when a
    separator item is selected, the provider has no client, the provider is
    unknown, or any exception is raised along the way.
    """
    markdown_rule = "- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__"
    extractor_names = {
        "openai": "_extract_with_openai",
        "claude": "_extract_with_claude",
        "gemini": "_extract_with_gemini",
    }

    try:
        prompt = self.instructions_text.toPlainText().strip()

        if not self.preserve_formatting_check.isChecked():
            # Drop the rule whether or not it is preceded by a newline.
            prompt = prompt.replace("\n" + markdown_rule, "").replace(markdown_rule, "")
        elif "markdown for text formatting" not in prompt:
            prompt += "\n" + markdown_rule

        model = self.model_combo.currentText()
        if model.startswith("---"):
            # Combo box separator rows are not real models.
            return "[ERROR: Please select a valid model, not a separator]"

        provider = self._get_provider_from_model(model)
        if provider not in self.clients:
            return f"[ERROR: {provider.title()} client not initialized. Check API key in api_keys.txt]"

        method_name = extractor_names.get(provider)
        if method_name is None:
            return f"[ERROR: Unknown provider: {provider}]"

        return getattr(self, method_name)(image_path, model, prompt)

    except Exception as e:
        return f"[ERROR extracting text: {str(e)}]"
|
|
869
|
+
|
|
870
|
+
def _get_provider_from_model(self, model):
    """Map a model name to its provider key by prefix.

    Returns ``"openai"`` for names starting with ``gpt``, ``o1`` or ``o3``,
    ``"claude"`` / ``"gemini"`` for those prefixes, and ``"unknown"``
    otherwise.
    """
    prefix_table = (
        (("gpt", "o1", "o3"), "openai"),
        (("claude",), "claude"),
        (("gemini",), "gemini"),
    )
    for prefixes, provider in prefix_table:
        if model.startswith(prefixes):
            return provider
    return "unknown"
|
|
880
|
+
|
|
881
|
+
def _extract_with_openai(self, image_path, model, instructions):
    """Extract text from an image through the OpenAI chat completions API.

    The image is sent as a base64 data URL together with *instructions* in
    a single user message, and the first choice's message content is
    returned.

    The data URL is labelled with a media type derived from the file
    extension (falling back to ``image/jpeg``) instead of always claiming
    JPEG. Models whose names start with ``gpt-5``, ``o1``, ``o3`` or ``o4``
    get the ``max_completion_tokens`` parameter; all others get
    ``max_tokens``.
    """
    base64_image = self._encode_image(image_path)

    # Same extension table as the Claude path uses.
    media_type_map = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp'
    }
    ext = os.path.splitext(image_path)[1].lower()
    media_type = media_type_map.get(ext, 'image/jpeg')

    api_params = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": instructions},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{media_type};base64,{base64_image}"}
                    }
                ]
            }
        ]
    }

    # "o3" is routed to OpenAI by _get_provider_from_model, so it must be
    # covered here alongside "o1" and "gpt-5".
    if model.startswith(("gpt-5", "o1", "o3", "o4")):
        api_params["max_completion_tokens"] = 4000
    else:
        api_params["max_tokens"] = 4000

    response = self.clients['openai'].chat.completions.create(**api_params)
    return response.choices[0].message.content
|
|
909
|
+
|
|
910
|
+
def _extract_with_claude(self, image_path, model, instructions):
    """Extract text from an image through the Claude messages API.

    The file is read and base64-encoded, tagged with a media type chosen
    from its extension (``image/jpeg`` when the extension is not in the
    table), and sent ahead of the *instructions* text in one user message
    with a 4000-token limit. Returns the text of the first content block.
    """
    import base64

    extension = os.path.splitext(image_path)[1].lower()
    known_media_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp'
    }
    media_type = known_media_types.get(extension, 'image/jpeg')

    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    image_data = base64.standard_b64encode(raw_bytes).decode("utf-8")

    image_part = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": media_type,
            "data": image_data,
        },
    }
    text_part = {
        "type": "text",
        "text": instructions
    }

    response = self.clients['claude'].messages.create(
        model=model,
        max_tokens=4000,
        messages=[
            {
                "role": "user",
                "content": [image_part, text_part],
            }
        ],
    )
    return response.content[0].text
|
|
954
|
+
|
|
955
|
+
def _extract_with_gemini(self, image_path, model, instructions):
    """Extract text from an image through the Gemini API.

    Creates a ``GenerativeModel`` for *model* from the stored client and
    sends the instructions together with the opened PIL image. Returns the
    response's ``text``.

    The image is opened in a ``with`` block so its file handle is released
    once the request has been made, instead of staying open until the
    object happens to be garbage-collected.
    """
    from PIL import Image

    # Create model instance
    gemini_model = self.clients['gemini'].GenerativeModel(model)

    # Generate content with image and prompt; the image only has to stay
    # open for the duration of the call.
    with Image.open(image_path) as img:
        response = gemini_model.generate_content([instructions, img])

    return response.text
|
|
969
|
+
|
|
970
|
+
def _process_selected(self):
    """Run OCR on the currently selected image and show the result.

    The result is always shown in the preview. It is stored in
    ``self.extracted_texts`` (which marks the file as processed) only when
    extraction succeeded: ``_extract_text_from_image`` reports every failure
    as a string starting with ``[ERROR``, and such results are logged as
    errors and not cached, so a previous good extraction is not overwritten.
    """
    selected_items = self.file_listbox.selectedItems()
    if not selected_items:
        QMessageBox.warning(None, "No Selection", "Please select an image to process")
        return

    idx = self.file_listbox.row(selected_items[0])
    if idx >= len(self.image_files):
        return

    file = self.image_files[idx]
    filename = os.path.basename(file)

    # Get provider info for status display
    model = self.model_combo.currentText()
    provider = self._get_provider_from_model(model).title()

    self.log_message(f"Processing selected image with {provider} ({model}): {filename}")
    self.status_label.setText(f"Processing with {provider}... {filename}")
    QApplication.processEvents()  # let the status text repaint before the blocking call

    text = self._extract_text_from_image(file)
    is_text = isinstance(text, str)
    failed = (not is_text) or text.startswith("[ERROR")

    self.preview_text.setPlainText(text if is_text else "")

    if failed:
        self.log_message(f"ERROR processing {filename}: {text}")
        self.status_label.setText(f"✗ Failed to process {filename} with {provider}")
        return

    self.extracted_texts[file] = text
    self._update_listbox()
    self.log_message(f"Successfully processed: {filename}")
    self.status_label.setText(f"✓ Processed {filename} with {provider}")
|
|
1000
|
+
|
|
1001
|
+
def _process_all(self):
    """Run OCR on every queued image that has no stored text yet.

    After confirmation, images are processed in list order while the
    progress bar, status label and list widget are updated. Files that
    already have an entry in ``self.extracted_texts`` are skipped.

    Results starting with ``[ERROR`` (the failure format returned by
    ``_extract_text_from_image``) are not stored, so failed images stay
    unmarked and are retried on the next run. The final log line and dialog
    report how many images were processed, skipped and failed.
    """
    if not self.image_files:
        QMessageBox.warning(None, "No Files", "Please add images first")
        return

    # Get provider info for confirmation
    model = self.model_combo.currentText()
    provider = self._get_provider_from_model(model).title()
    total = len(self.image_files)

    reply = QMessageBox.question(
        None,
        "Process All",
        f"Process all {total} images with {provider} ({model})?\n\n"
        "This will use API credits and may take several minutes.",
        QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No
    )
    if reply != QMessageBox.StandardButton.Yes:
        return

    self.log_message(f"Starting batch processing with {provider}: {total} images")
    self.progress.setMaximum(total)
    self.progress.setValue(0)

    processed = skipped = failed = 0

    for i, file in enumerate(self.image_files, 1):
        filename = os.path.basename(file)
        self.status_label.setText(f"[{provider}] Processing {i}/{total}: {filename}...")
        QApplication.processEvents()  # keep the UI responsive between blocking calls

        if file in self.extracted_texts:
            skipped += 1
            self.log_message(f"  [{i}/{total}] Skipped (already processed): {filename}")
        else:
            text = self._extract_text_from_image(file)
            if isinstance(text, str) and not text.startswith("[ERROR"):
                self.extracted_texts[file] = text
                processed += 1
                self.log_message(f"  [{i}/{total}] Processed: {filename}")
            else:
                # Do not cache failures: they would be exported as page
                # text and skipped as "already processed" next time.
                failed += 1
                self.log_message(f"  [{i}/{total}] FAILED: {filename} - {text}")

        self.progress.setValue(i)
        self._update_listbox()

    summary = f"{processed} processed, {skipped} skipped, {failed} failed"
    self.log_message(f"Batch processing complete: {summary}")

    if failed:
        self.status_label.setText(f"⚠ Batch finished with {provider}: {summary}")
        QMessageBox.warning(
            None,
            "Complete with Errors",
            f"Batch finished with {provider}: {summary}.\n\n"
            "See the log for details. Failed images can be processed again."
        )
        return

    self.status_label.setText(f"✓ Processed all {total} images with {provider}!")
    QMessageBox.information(
        None,
        "Complete",
        f"Successfully processed {total} images with {provider}!\n\n"
        "Click 'Save DOCX' to export the text."
    )
|
|
1048
|
+
|
|
1049
|
+
# === Export Methods ===
|
|
1050
|
+
|
|
1051
|
+
def _add_formatted_text(self, doc, text):
    """
    Add text to document with markdown formatting parsed
    Supports: **bold**, *italic*, __underline__
    Also handles multi-column layouts with [START COLUMN X] / [END COLUMN X] markers

    Text is delegated to ``_add_multi_column_text`` only when it contains a
    column marker AND at least one line carries a marker with a column
    number. ``_add_multi_column_text`` calls back into this method when it
    finds no numbered column, so delegating on the bare substring alone
    (e.g. ``[START COLUMN one]``) would recurse without end.

    Otherwise the text is split into paragraphs on blank lines, single
    newlines inside a paragraph become spaces, and each paragraph is added
    to *doc* as a sequence of runs.
    """
    # Check if text has column markers that the column parser can really use
    if '[START COLUMN' in text or '[COLUMN 1]' in text or '[COLUMN 2]' in text:
        numbered_marker = re.compile(r'\[(?:START )?COLUMN[:\s]+(\d+)\]', re.IGNORECASE)
        # Checked per line because the column parser also works line by line.
        if any(numbered_marker.search(line) for line in text.split('\n')):
            self._add_multi_column_text(doc, text)
            return

    # Split text into paragraphs (separated by double newlines or more)
    # This prevents treating every single line as a paragraph
    paragraphs = re.split(r'\n\s*\n', text)

    for para_text in paragraphs:
        if not para_text.strip():
            continue

        # Replace single newlines within paragraph with spaces (removes extraneous line breaks)
        para_text = para_text.replace('\n', ' ').strip()

        para = doc.add_paragraph()
        para.paragraph_format.line_spacing = 1.15
        para.paragraph_format.space_after = Pt(6)  # Reduced from 12 to 6 for tighter spacing

        # Consume the paragraph from the left, one run at a time
        remaining = para_text

        while remaining:
            # Check for bold (**text**)
            bold_match = re.match(r'\*\*(.*?)\*\*', remaining)
            if bold_match:
                run = para.add_run(bold_match.group(1))
                run.bold = True
                remaining = remaining[bold_match.end():]
                continue

            # Check for underline (__text__)
            underline_match = re.match(r'__(.*?)__', remaining)
            if underline_match:
                run = para.add_run(underline_match.group(1))
                run.underline = True
                remaining = remaining[underline_match.end():]
                continue

            # Check for italic (*text*)
            italic_match = re.match(r'\*(.*?)\*', remaining)
            if italic_match:
                run = para.add_run(italic_match.group(1))
                run.italic = True
                remaining = remaining[italic_match.end():]
                continue

            # No formatting - add plain text until next marker or end
            next_marker = len(remaining)
            for marker in ['**', '*', '__']:
                pos = remaining.find(marker)
                if pos != -1 and pos < next_marker:
                    next_marker = pos

            if next_marker == 0:
                # Edge case: marker at start but no match (e.g., single * or **)
                # Emit one character so the loop always makes progress.
                para.add_run(remaining[0])
                remaining = remaining[1:]
            else:
                plain_text = remaining[:next_marker] if next_marker < len(remaining) else remaining
                if plain_text:
                    para.add_run(plain_text)
                remaining = remaining[next_marker:]
|
|
1121
|
+
|
|
1122
|
+
def _add_multi_column_text(self, doc, text):
    """
    Handle multi-column text layout using a Word table
    Supports markers like [START COLUMN 1], [END COLUMN 1], etc.

    Lines between a numbered column marker and the next end marker are
    collected per column number and written into a one-row table with one
    cell per column and all cell borders set to "none". Lines outside any
    column are not written.

    Column numbers below 1 are treated as column 1, because column N is
    stored in cell index N - 1. When no numbered column is found, the text
    is passed to ``_add_formatted_text`` with its unusable column markers
    removed; those markers are what routes text into this method, so
    passing them back unchanged would recurse without end.
    """
    marker_pattern = re.compile(r'\[(?:START )?COLUMN[:\s]+(\d+)\]', re.IGNORECASE)

    # Parse columns from text
    columns = {}
    current_column = None

    for line in text.split('\n'):
        upper_line = line.upper()

        # Check for column start marker
        if '[START COLUMN' in upper_line or '[COLUMN' in upper_line:
            match = marker_pattern.search(line)
            if match:
                current_column = max(1, int(match.group(1)))
                columns.setdefault(current_column, [])
            continue

        # Check for column end marker
        if '[END COLUMN' in upper_line:
            current_column = None
            continue

        # Add line to current column
        if current_column is not None:
            columns[current_column].append(line)

    if not columns:
        # No columns found, fall back to regular formatting. Strip whatever
        # still looks like a column marker until none of the substrings that
        # trigger multi-column handling remain.
        loose_marker = re.compile(r'\[(?:START |END )?COLUMN[^\]\n]*\]?', re.IGNORECASE)
        fallback_text = text
        while ('[START COLUMN' in fallback_text
               or '[COLUMN 1]' in fallback_text
               or '[COLUMN 2]' in fallback_text):
            # Every trigger substring is matched by loose_marker, so each
            # pass shortens the text and the loop terminates.
            fallback_text = loose_marker.sub('', fallback_text)
        self._add_formatted_text(doc, fallback_text)
        return

    from docx.oxml import OxmlElement
    from docx.shared import Inches

    w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

    num_columns = max(columns)
    table = doc.add_table(rows=1, cols=num_columns)
    table.style = 'Table Grid'

    # Set column widths to be equal
    for col_idx in range(num_columns):
        for cell in table.columns[col_idx].cells:
            cell.width = Inches(6.5 / num_columns)

    # Fill each column
    for col_num in sorted(columns):
        cell = table.cell(0, col_num - 1)
        cell.text = ''
        for para_text in columns[col_num]:
            if para_text.strip():
                para = cell.add_paragraph(para_text.strip())
                para.paragraph_format.line_spacing = 1.0
                para.paragraph_format.space_after = Pt(6)

    # Remove table borders for cleaner look
    for row in table.rows:
        for cell in row.cells:
            tcPr = cell._element.get_or_add_tcPr()
            tcBorders = tcPr.find('.//' + w_ns + 'tcBorders')
            if tcBorders is None:
                tcBorders = OxmlElement('w:tcBorders')
                tcPr.append(tcBorders)
            # Set all borders to none
            for border_name in ['top', 'left', 'bottom', 'right', 'insideH', 'insideV']:
                border = tcBorders.find('.//' + w_ns + border_name)
                if border is None:
                    border = OxmlElement(f'w:{border_name}')
                    tcBorders.append(border)
                border.set(w_ns + 'val', 'none')
|
|
1199
|
+
|
|
1200
|
+
def _parse_markdown_table(self, table_lines):
    """Parse markdown table lines into rows and cells

    Args:
        table_lines: iterable of raw text lines.

    Returns:
        A list of rows, each a list of stripped cell strings. Lines that
        are empty or do not start with ``|`` are ignored, as is the
        header separator row.

    A row counts as the separator only if every cell consists solely of
    dashes, colons and spaces AND the row contains at least one dash, so a
    data row whose cells are all empty is kept. A missing trailing pipe no
    longer costs the row its last cell.
    """
    rows = []
    for line in table_lines:
        line = line.strip()
        if not line or not line.startswith('|'):
            continue

        # Drop the empty piece before the leading pipe, and the one after
        # the trailing pipe only when that pipe is really there.
        pieces = line.split('|')
        pieces = pieces[1:-1] if line.endswith('|') else pieces[1:]
        cells = [piece.strip() for piece in pieces]
        if not cells:
            continue

        # Skip separator row (dashes, colons and spaces only, with a dash)
        only_rule_chars = all(not cell.strip('-: ') for cell in cells)
        has_dash = any('-' in cell for cell in cells)
        if only_rule_chars and has_dash:
            continue

        rows.append(cells)
    return rows
|
|
1218
|
+
|
|
1219
|
+
def _extract_markdown_tables(self, text):
    """Separate markdown tables from the surrounding text.

    A line is a table row when, stripped, it starts and ends with ``|`` and
    has at least one more ``|`` in between. Each run of consecutive table
    rows is parsed with ``_parse_markdown_table``; when that yields rows,
    the run is replaced by a ``[TABLE_n]`` placeholder line (n being the
    index into the returned list), otherwise the run is dropped.

    Returns:
        ``(text_without_tables, tables_list)``
    """
    kept_lines = []
    tables = []
    pending_rows = []

    def flush(row_lines):
        """Parse one finished run and record it with its placeholder."""
        parsed = self._parse_markdown_table(row_lines)
        if parsed:
            tables.append(parsed)
            kept_lines.append(f"[TABLE_{len(tables) - 1}]")

    for raw_line in text.split('\n'):
        core = raw_line.strip()
        if core.startswith('|') and core.endswith('|') and '|' in core[1:-1]:
            pending_rows.append(raw_line)
            continue

        # A non-table line closes any run collected so far.
        if pending_rows:
            flush(pending_rows)
            pending_rows = []
        kept_lines.append(raw_line)

    # Handle table at end of text
    if pending_rows:
        flush(pending_rows)

    return '\n'.join(kept_lines), tables
|
|
1259
|
+
|
|
1260
|
+
def _add_markdown_table_to_doc(self, doc, table_rows):
    """Add a Word table from markdown table rows

    Args:
        doc: document object providing ``add_table`` and ``add_paragraph``.
        table_rows: list of rows, each a list of cell strings.

    The table is as wide as the widest row, so rows longer than the header
    keep all their cells; shorter rows are padded with empty cells. The
    caller's row lists are not modified. Markdown emphasis markers
    (``**``, ``*``, ``__``) are stripped from cell text, and an empty
    paragraph is added after the table for spacing. Nothing is added when
    there are no rows or no cells.
    """
    if not table_rows:
        return

    num_rows = len(table_rows)
    num_cols = max(len(row) for row in table_rows)
    if num_cols == 0:
        return

    # Create Word table
    table = doc.add_table(rows=num_rows, cols=num_cols)
    table.style = 'Light Grid Accent 1'

    # Populate table cells
    for row_idx, row_data in enumerate(table_rows):
        # Fetch the row's cells once instead of once per column.
        row_cells = table.rows[row_idx].cells
        for col_idx in range(num_cols):
            cell_text = row_data[col_idx] if col_idx < len(row_data) else ''
            # Remove markdown formatting from table cells (plain text only)
            cell_text = cell_text.replace('**', '').replace('*', '').replace('__', '')
            row_cells[col_idx].text = cell_text

    # Add spacing after table
    doc.add_paragraph()
|
|
1288
|
+
|
|
1289
|
+
def _add_formatted_text_with_tables(self, doc, text):
    """Write text to the document, rendering markdown tables as Word tables.

    Tables are first pulled out with ``_extract_markdown_tables``, which
    leaves ``[TABLE_n]`` placeholder lines behind. Ordinary lines are
    buffered and handed to ``_add_formatted_text`` whenever a placeholder
    (or the end of the text) is reached; each placeholder with a valid
    index is rendered through ``_add_markdown_table_to_doc``. Placeholders
    with an unparsable or out-of-range index are dropped.
    """
    remaining_text, tables = self._extract_markdown_tables(text)

    buffered = []

    def flush_buffer():
        """Emit the buffered lines as formatted text, if they hold any."""
        if buffered:
            joined = '\n'.join(buffered).strip()
            if joined:
                self._add_formatted_text(doc, joined)
            del buffered[:]

    for line in remaining_text.split('\n'):
        token = line.strip()
        is_placeholder = token.startswith('[TABLE_') and token.endswith(']')
        if not is_placeholder:
            buffered.append(line)
            continue

        # Text collected so far belongs before the table.
        flush_buffer()

        try:
            index = int(token[7:-1])  # Extract number from [TABLE_0]
        except (ValueError, IndexError):
            continue
        if 0 <= index < len(tables):
            self._add_markdown_table_to_doc(doc, tables[index])

    # Add any remaining paragraph text
    flush_buffer()
|
|
1324
|
+
|
|
1325
|
+
def _generate_markdown_file(self, output_file):
|
|
1326
|
+
"""Generate markdown file from extracted texts"""
|
|
1327
|
+
with open(output_file, 'w', encoding='utf-8') as f:
|
|
1328
|
+
# Header
|
|
1329
|
+
f.write("# Extracted Text from Images\n\n")
|
|
1330
|
+
f.write("*Generated by PDF Rescue (a Supervertaler module)*\n\n")
|
|
1331
|
+
f.write("---\n\n")
|
|
1332
|
+
|
|
1333
|
+
# Add extracted text in order
|
|
1334
|
+
for i, file in enumerate(self.image_files, 1):
|
|
1335
|
+
if file in self.extracted_texts:
|
|
1336
|
+
# Page header
|
|
1337
|
+
f.write(f"## Page {i}: {os.path.basename(file)}\n\n")
|
|
1338
|
+
|
|
1339
|
+
# Text content (already in markdown format)
|
|
1340
|
+
text = self.extracted_texts[file]
|
|
1341
|
+
f.write(text)
|
|
1342
|
+
f.write("\n\n")
|
|
1343
|
+
f.write("---\n\n")
|
|
1344
|
+
|
|
1345
|
+
def _markdown_to_docx(self, md_file, docx_file):
    """Convert the markdown file ``md_file`` into a Word document.

    The markdown is processed line by line:

    * blank lines and horizontal rules (``---`` / ``***``) become an empty
      paragraph;
    * ``# ``, ``## `` and ``### `` lines become headings;
    * a line wrapped in exactly one pair of ``*`` becomes a small italic
      paragraph (used for the branding line);
    * consecutive ``|...|`` lines are parsed with
      ``self._parse_markdown_table`` and added as a table;
    * everything else is gathered into paragraphs and handed to
      ``self._add_formatted_text_with_tables``.

    If a block of table-looking lines cannot be parsed, the raw lines are
    written as a plain paragraph instead of being dropped.

    Args:
        md_file: Path of the UTF-8 markdown file to read.
        docx_file: Path the resulting document is saved to.
    """
    with open(md_file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    doc = Document()

    def is_table_row(candidate):
        # Expects an already stripped line.
        return candidate.startswith('|') and candidate.endswith('|')

    # (markdown prefix, docx heading level, font size in pt or None for default)
    heading_styles = (
        ('# ', 0, 16),
        ('## ', 2, 12),
        ('### ', 3, None),
    )

    i = 0
    total = len(lines)
    while i < total:
        line = lines[i].rstrip()  # Keep left whitespace, remove right
        stripped = line.strip()

        # Blank lines and horizontal rules both become vertical spacing.
        if not stripped or stripped in ('---', '***'):
            doc.add_paragraph()
            i += 1
            continue

        # Headings are matched on the unstripped line, so indented '#' lines
        # fall through to paragraph handling.
        style = next((s for s in heading_styles if line.startswith(s[0])), None)
        if style is not None:
            prefix, level, size = style
            heading = doc.add_heading(line[len(prefix):].strip(), level)
            if size is not None:
                heading.runs[0].font.size = Pt(size)
            i += 1
            continue

        # Single-line italic (branding): exactly one leading and one trailing '*'.
        if (stripped.startswith('*') and stripped.endswith('*')
                and stripped.count('*') == 2 and len(stripped) > 2):
            run = doc.add_paragraph().add_run(stripped[1:-1])
            run.italic = True
            run.font.size = Pt(9)
            i += 1
            continue

        # A table starts with a row that has at least one inner '|'.
        if is_table_row(stripped) and '|' in stripped[1:-1]:
            table_lines = [line]
            i += 1
            while i < total:
                next_line = lines[i].rstrip()
                if not is_table_row(next_line.strip()):
                    break
                table_lines.append(next_line)
                i += 1

            parsed_table = self._parse_markdown_table(table_lines)
            if parsed_table:
                self._add_markdown_table_to_doc(doc, parsed_table)
            else:
                # Do not lose content when the block is not a valid table.
                doc.add_paragraph('\n'.join(table_lines))
            continue

        # Regular paragraph text: collect until a blank line or special marker.
        para_lines = [line]
        i += 1
        while i < total:
            next_line = lines[i].rstrip()
            next_stripped = next_line.strip()
            if not next_stripped:
                break
            if next_stripped.startswith('#') or next_stripped in ('---', '***'):
                break
            if is_table_row(next_stripped):
                break
            para_lines.append(next_line)
            i += 1

        para_text = '\n'.join(para_lines)
        if para_text.strip():
            self._add_formatted_text_with_tables(doc, para_text)

    doc.save(docx_file)
|
|
1441
|
+
|
|
1442
|
+
def _export_markdown_and_word(self):
    """Export the extracted text as markdown, DOCX and a session report.

    Asks the user for a base ``.md`` file name, then writes three files next
    to each other: ``<name>.md``, ``<name>.docx`` and ``<name>_report.md``.
    On success the user is offered to open the files with the system's
    default applications.

    Opening uses ``QDesktopServices`` rather than the Windows-only
    ``os.startfile``, and happens outside the export ``try`` block so that a
    failure to open a file is not reported as a failed export.
    """
    if not self.extracted_texts:
        QMessageBox.warning(
            None,
            "No Text",
            "No extracted text to export.\n\n"
            "Process images first."
        )
        return

    # Ask for base file name (will generate .md, .docx, and _report.md)
    output_file, _ = QFileDialog.getSaveFileName(
        parent=None,
        caption="Export Markdown & Word Documents",
        filter="Markdown files (*.md);;All files (*.*)",
        initialFilter="Markdown files (*.md)"
    )

    if not output_file:
        return

    # Derive the three output paths from the chosen name.
    base_path = Path(output_file)
    if base_path.suffix != '.md':
        base_path = base_path.with_suffix('.md')

    md_file = str(base_path)
    docx_file = str(base_path.with_suffix('.docx'))
    report_file = str(base_path.parent / f"{base_path.stem}_report.md")

    self.log_message(f"Exporting documents: {base_path.name}")

    try:
        # Step 1: Generate markdown file
        self._generate_markdown_file(md_file)
        self.log_message(f"✓ Markdown file created: {base_path.name}")

        # Step 2: Convert markdown to DOCX
        self._markdown_to_docx(md_file, docx_file)
        self.log_message(f"✓ Word document created: {base_path.stem}.docx")

        # Step 3: Generate session report
        self._save_session_report_to_file(report_file)
        self.log_message(f"✓ Session report created: {base_path.stem}_report.md")
    except Exception as e:
        QMessageBox.critical(None, "Error", f"Failed to export documents:\n\n{str(e)}")
        self.log_message(f"✗ Export error: {str(e)}")
        return

    files_created = f"✓ {base_path.name}\n✓ {base_path.stem}.docx\n✓ {base_path.stem}_report.md"
    self.status_label.setText("✓ Exported 3 files successfully")

    reply = QMessageBox.question(
        None,
        "Export Complete",
        f"Successfully exported all documents!\n\n"
        f"Files created:\n{files_created}\n\n"
        "Open the files now?",
        QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No
    )
    if reply != QMessageBox.StandardButton.Yes:
        return

    # Platform-independent "open with default application".
    from PyQt6.QtCore import QUrl
    from PyQt6.QtGui import QDesktopServices

    for created in (md_file, docx_file, report_file):
        if not QDesktopServices.openUrl(QUrl.fromLocalFile(created)):
            self.log_message(f"✗ Could not open: {Path(created).name}")
|
|
1508
|
+
|
|
1509
|
+
def _save_session_report_to_file(self, output_file):
|
|
1510
|
+
"""Generate and save session report to specified file (internal method)"""
|
|
1511
|
+
from datetime import datetime
|
|
1512
|
+
|
|
1513
|
+
# Generate report content
|
|
1514
|
+
report_lines = []
|
|
1515
|
+
report_lines.append("# PDF Rescue - Session Report\n")
|
|
1516
|
+
report_lines.append("**Generated by [Supervertaler](https://supervertaler.com/) • by Michael Beijer**\n\n")
|
|
1517
|
+
report_lines.append(f"**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
|
|
1518
|
+
report_lines.append("---\n\n")
|
|
1519
|
+
|
|
1520
|
+
# Configuration section
|
|
1521
|
+
report_lines.append("## Configuration\n\n")
|
|
1522
|
+
report_lines.append(f"- **Model**: {self.model_combo.currentText()}\n")
|
|
1523
|
+
formatting_status = "Enabled ✓" if self.preserve_formatting_check.isChecked() else "Disabled ✗"
|
|
1524
|
+
report_lines.append(f"- **Formatting Preservation**: {formatting_status}\n")
|
|
1525
|
+
report_lines.append(f"- **Total Images Processed**: {len(self.extracted_texts)}\n")
|
|
1526
|
+
report_lines.append(f"- **Total Images in List**: {len(self.image_files)}\n\n")
|
|
1527
|
+
|
|
1528
|
+
# Instructions used
|
|
1529
|
+
report_lines.append("## Extraction Instructions\n\n")
|
|
1530
|
+
report_lines.append("```\n")
|
|
1531
|
+
instructions = self.instructions_text.toPlainText().strip()
|
|
1532
|
+
if self.preserve_formatting_check.isChecked():
|
|
1533
|
+
if "markdown for text formatting" not in instructions:
|
|
1534
|
+
instructions += "\n- Use markdown for text formatting: **bold text**, *italic text*, __underlined text__"
|
|
1535
|
+
report_lines.append(instructions)
|
|
1536
|
+
report_lines.append("\n```\n\n")
|
|
1537
|
+
|
|
1538
|
+
# Processing summary
|
|
1539
|
+
report_lines.append("## Processing Summary\n\n")
|
|
1540
|
+
report_lines.append("| # | Image File | Status |\n")
|
|
1541
|
+
report_lines.append("|---|------------|--------|\n")
|
|
1542
|
+
|
|
1543
|
+
for i, file in enumerate(self.image_files, 1):
|
|
1544
|
+
filename = os.path.basename(file)
|
|
1545
|
+
status = "✓ Processed" if file in self.extracted_texts else "⧗ Pending"
|
|
1546
|
+
report_lines.append(f"| {i} | {filename} | {status} |\n")
|
|
1547
|
+
|
|
1548
|
+
report_lines.append("\n---\n\n")
|
|
1549
|
+
|
|
1550
|
+
# Extracted text for each image
|
|
1551
|
+
report_lines.append("## Extracted Text\n\n")
|
|
1552
|
+
|
|
1553
|
+
for i, file in enumerate(self.image_files, 1):
|
|
1554
|
+
if file in self.extracted_texts:
|
|
1555
|
+
filename = os.path.basename(file)
|
|
1556
|
+
report_lines.append(f"### Page {i}: {filename}\n\n")
|
|
1557
|
+
report_lines.append("```\n")
|
|
1558
|
+
report_lines.append(self.extracted_texts[file])
|
|
1559
|
+
report_lines.append("\n```\n\n")
|
|
1560
|
+
report_lines.append("---\n\n")
|
|
1561
|
+
|
|
1562
|
+
# Statistics
|
|
1563
|
+
report_lines.append("## Statistics\n\n")
|
|
1564
|
+
texts_list = list(self.extracted_texts.values())
|
|
1565
|
+
total_chars = sum(len(text) for text in texts_list)
|
|
1566
|
+
total_words = sum(len(text.split()) for text in texts_list)
|
|
1567
|
+
report_lines.append(f"- **Total Characters Extracted**: {total_chars:,}\n")
|
|
1568
|
+
report_lines.append(f"- **Total Words Extracted**: {total_words:,}\n")
|
|
1569
|
+
report_lines.append(f"- **Average Characters per Page**: {total_chars // len(self.extracted_texts) if self.extracted_texts else 0:,}\n")
|
|
1570
|
+
report_lines.append(f"- **Average Words per Page**: {total_words // len(self.extracted_texts) if self.extracted_texts else 0:,}\n\n")
|
|
1571
|
+
|
|
1572
|
+
# Footer
|
|
1573
|
+
report_lines.append("---\n\n")
|
|
1574
|
+
report_lines.append("*Report generated by **PDF Rescue** - AI-Powered OCR Tool*\n\n")
|
|
1575
|
+
report_lines.append("*Part of [**Supervertaler**](https://supervertaler.com/) • by Michael Beijer*\n")
|
|
1576
|
+
|
|
1577
|
+
# Write to file
|
|
1578
|
+
with open(output_file, 'w', encoding='utf-8') as f:
|
|
1579
|
+
f.writelines(report_lines)
|
|
1580
|
+
|
|
1581
|
+
def _save_to_docx(self):
    """Save all extracted text to a Word document chosen by the user.

    The document gets a title, a branding line with a hyperlink, and one
    section per processed image (heading ``Page N: <file name>`` followed by
    the text). With formatting preservation enabled the text goes through
    ``self._add_formatted_text``; otherwise blank-line-separated paragraphs
    are written with single newlines collapsed to spaces.

    Page breaks are inserted only *between* processed pages, so unprocessed
    images at the end of the list no longer leave an empty last page. The
    saved document is opened via ``QDesktopServices`` (``os.startfile`` is
    Windows-only), outside the ``try`` block, so an opening problem is not
    reported as a failed save.
    """
    if not self.extracted_texts:
        QMessageBox.warning(
            None,
            "No Text",
            "No extracted text to save.\n\n"
            "Process images first."
        )
        return

    output_file, _ = QFileDialog.getSaveFileName(
        parent=None,
        caption="Save Extracted Text",
        filter="Word Document (*.docx);;All files (*.*)",
        initialFilter="Word Document (*.docx)"
    )

    if not output_file:
        return

    self.log_message(f"Saving extracted text to DOCX: {Path(output_file).name}")

    try:
        doc = Document()

        # Add title
        title = doc.add_heading('Extracted Text from Images', 0)
        title.runs[0].font.size = Pt(16)

        # Add branding with hyperlink to Supervertaler
        from docx.oxml import OxmlElement
        from docx.oxml.ns import qn

        def add_hyperlink(paragraph, text, url):
            """Append an external hyperlink showing ``text`` to ``paragraph``."""
            # Register the URL as an external relationship of the document part.
            part = paragraph.part
            r_id = part.relate_to(url, 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink', is_external=True)

            # <w:hyperlink r:id="..."> wrapping a single run
            hyperlink = OxmlElement('w:hyperlink')
            hyperlink.set(qn('r:id'), r_id)

            new_run = OxmlElement('w:r')
            rPr = OxmlElement('w:rPr')

            # Reference the 'Hyperlink' character style
            rStyle = OxmlElement('w:rStyle')
            rStyle.set(qn('w:val'), 'Hyperlink')
            rPr.append(rStyle)
            new_run.append(rPr)

            new_run.text = text
            hyperlink.append(new_run)

            paragraph._p.append(hyperlink)
            return hyperlink

        branding_para = doc.add_paragraph()

        # Text before hyperlink
        run1 = branding_para.add_run('Generated by PDF Rescue (a ')
        run1.font.size = Pt(9)
        run1.italic = True

        add_hyperlink(branding_para, 'Supervertaler', 'https://supervertaler.com/')

        # Text after hyperlink
        run2 = branding_para.add_run(' module)')
        run2.font.size = Pt(9)
        run2.italic = True

        # Add spacing
        doc.add_paragraph()

        # Only images with extracted text produce a page; keep their
        # 1-based position in the full list for the heading.
        processed = [
            (page_no, file)
            for page_no, file in enumerate(self.image_files, 1)
            if file in self.extracted_texts
        ]
        preserve_formatting = self.preserve_formatting_check.isChecked()
        last_pos = len(processed) - 1

        for pos, (page_no, file) in enumerate(processed):
            # Page header
            heading = doc.add_heading(f'Page {page_no}: {os.path.basename(file)}', level=2)
            heading.runs[0].font.size = Pt(12)

            text = self.extracted_texts[file]
            if preserve_formatting:
                self._add_formatted_text(doc, text)
            else:
                # Blank lines separate paragraphs; single newlines inside a
                # paragraph are extraneous line breaks and become spaces.
                for para_text in re.split(r'\n\s*\n', text):
                    if para_text.strip():
                        para_text = para_text.replace('\n', ' ').strip()
                        para = doc.add_paragraph(para_text)
                        para.paragraph_format.line_spacing = 1.15
                        para.paragraph_format.space_after = Pt(6)

            # Page break except after the last processed page
            if pos < last_pos:
                doc.add_page_break()

        doc.save(output_file)
    except Exception as e:
        QMessageBox.critical(None, "Error", f"Failed to save document:\n\n{str(e)}")
        self.log_message(f"✗ Save error: {str(e)}")
        return

    self.log_message(f"Successfully saved {len(self.extracted_texts)} pages to: {Path(output_file).name}")
    self.status_label.setText(f"✓ Saved to {os.path.basename(output_file)}")

    reply = QMessageBox.question(
        None,
        "Success",
        f"Document saved successfully!\n\n"
        f"{len(self.extracted_texts)} pages of text extracted\n\n"
        "Open the document now?",
        QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No
    )
    if reply != QMessageBox.StandardButton.Yes:
        return

    # Platform-independent "open with default application".
    from PyQt6.QtCore import QUrl
    from PyQt6.QtGui import QDesktopServices

    if not QDesktopServices.openUrl(QUrl.fromLocalFile(output_file)):
        self.log_message(f"✗ Could not open: {Path(output_file).name}")
|
|
1706
|
+
|
|
1707
|
+
def _copy_all_text(self):
    """Copy the extracted text of all processed images to the clipboard.

    Each page is emitted as a ``=== Page N: <file name> ===`` header line,
    the page text, and two newlines; pages follow the order of
    ``self.image_files``. Success is reported only if the text was actually
    placed on the clipboard.
    """
    if not self.extracted_texts:
        QMessageBox.warning(None, "No Text", "No extracted text to copy")
        return

    combined = "".join(
        f"=== Page {i}: {os.path.basename(file)} ===\n{self.extracted_texts[file]}\n\n"
        for i, file in enumerate(self.image_files, 1)
        if file in self.extracted_texts
    )

    # The clipboard belongs to the running QApplication.
    app = QApplication.instance()
    if app is None:
        # Nothing was copied, so do not claim success.
        self.log_message("✗ Copy failed: no running QApplication")
        return

    app.clipboard().setText(combined)

    self.status_label.setText(f"✓ Copied {len(self.extracted_texts)} pages to clipboard")
    QMessageBox.information(None, "Copied", f"Copied text from {len(self.extracted_texts)} pages to clipboard!")
|
|
1731
|
+
|
|
1732
|
+
|
|
1733
|
+
# === Standalone Application ===
|
|
1734
|
+
|
|
1735
|
+
if __name__ == "__main__":
    # Run PDF Rescue as a standalone application
    import sys
    from PyQt6.QtWidgets import QApplication, QMainWindow, QTextEdit, QLabel
    from PyQt6.QtCore import Qt

    class StandaloneApp(QMainWindow):
        """Minimal parent app for standalone mode.

        Provides what ``PDFRescueQt`` is given as its parent here: an
        ``api_keys`` dict, a ``log()`` method and ``load_api_keys()``.
        """

        def __init__(self):
            super().__init__()
            self.setWindowTitle("PDF Rescue - AI-Powered OCR Tool")
            self.setGeometry(100, 100, 1200, 800)

            # Central widget
            central_widget = QWidget()
            self.setCentralWidget(central_widget)

            # Main layout
            main_layout = QVBoxLayout(central_widget)
            main_layout.setContentsMargins(10, 10, 10, 10)

            # Title
            title = QLabel("PDF Rescue - AI-Powered OCR Tool")
            title.setFont(QFont("Segoe UI", 14, QFont.Weight.Bold))
            title.setAlignment(Qt.AlignmentFlag.AlignCenter)
            main_layout.addWidget(title)

            # Create the log widget before anything that might call log().
            # NOTE(review): PDFRescueQt / create_tab are not visible from
            # here; if they log during construction, log_text must exist.
            log_group = QGroupBox("Activity Log")
            log_layout = QVBoxLayout(log_group)

            self.log_text = QTextEdit()
            self.log_text.setReadOnly(True)
            self.log_text.setMaximumHeight(150)
            self.log_text.setFont(QFont("Consolas", 9))
            log_layout.addWidget(self.log_text)

            # Load API keys
            self.api_keys = self._read_api_keys()

            if not self.api_keys.get('openai'):
                QMessageBox.critical(
                    self,
                    "API Key Missing",
                    "Could not find OpenAI API key in api_keys.txt\n\n"
                    "Please add a line like:\nOPENAI_API_KEY=your-key-here\n\n"
                    "Or place api_keys.txt in the user_data folder."
                )
                # Still create UI but warn user

            # Create PDF Rescue instance (standalone mode)
            self.pdf_rescue = PDFRescueQt(self, standalone=True)
            pdf_rescue_widget = QWidget()
            self.pdf_rescue.create_tab(pdf_rescue_widget)
            main_layout.addWidget(pdf_rescue_widget)

            # Log goes at the bottom, below the PDF Rescue tab
            main_layout.addWidget(log_group)

        @staticmethod
        def _read_api_keys():
            """Return ``{'openai': key}`` read from api_keys.txt, or ``{}``.

            Looks for ``api_keys.txt`` in the working directory first, then in
            ``user_data_private`` (when ``.supervertaler.local`` exists) or
            ``user_data``. Lines are ``NAME=value``; ``#`` lines are comments.
            The value of every line whose name contains "openai" is taken,
            a later line overriding an earlier one.
            """
            api_keys = {}
            api_file = Path("api_keys.txt")
            if not api_file.exists():
                # Try user_data folder
                api_file = Path("user_data_private" if os.path.exists(".supervertaler.local") else "user_data") / "api_keys.txt"

            if api_file.exists():
                with open(api_file, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line and not line.startswith('#') and '=' in line:
                            key, value = line.split('=', 1)
                            if 'openai' in key.lower():
                                api_keys['openai'] = value.strip()
            return api_keys

        def log(self, message: str):
            """Append ``message`` to the activity log with an HH:MM:SS timestamp."""
            from datetime import datetime
            timestamp = datetime.now().strftime("%H:%M:%S")
            formatted_message = f"[{timestamp}] {message}"
            self.log_text.append(formatted_message)

        def load_api_keys(self):
            """Return the API keys loaded at start-up (kept for compatibility)."""
            return self.api_keys

    # Create and run standalone app
    app = QApplication(sys.argv)
    window = StandaloneApp()
    window.show()
    sys.exit(app.exec())
|
|
1822
|
+
|