doctra 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,483 @@
1
+ """
2
+ Enhanced Parser UI Module
3
+
4
+ This module contains all functionality for the Enhanced Parser tab in the Doctra Gradio interface.
5
+ It handles PDF parsing with DocRes image restoration, providing before/after comparison
6
+ and comprehensive document enhancement capabilities.
7
+ """
8
+
9
+ import tempfile
10
+ import traceback
11
+ from pathlib import Path
12
+ from typing import Tuple, List, Optional
13
+
14
+ import gradio as gr
15
+
16
+ from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
17
+ from doctra.utils.pdf_io import render_pdf_to_images
18
+ from doctra.ui.ui_helpers import gather_outputs, validate_vlm_config, create_page_html_content
19
+
20
+
21
def _resolve_enhanced_output_dir(outputs_root: Path, original_filename: str) -> Path:
    """Locate the directory holding the enhanced-parse results for a document."""
    out_dir = outputs_root / original_filename / "enhanced_parse"
    if out_dir.exists():
        return out_dir

    # Fallback: most recently modified *directory* directly under outputs.
    # Plain files are filtered out explicitly because a trailing-slash glob
    # pattern does not exclude them on every Python version.
    candidates = sorted(
        (p for p in outputs_root.glob("*") if p.is_dir()),
        key=lambda p: p.stat().st_mtime,
        reverse=True,
    )
    out_dir = candidates[0] / "enhanced_parse" if candidates else outputs_root
    if out_dir.exists():
        return out_dir

    # Last resort: any directory below outputs that contains an enhanced PDF.
    for candidate_dir in outputs_root.rglob("*"):
        if candidate_dir.is_dir() and any(candidate_dir.glob("*enhanced*.pdf")):
            return candidate_dir
    return out_dir


def _load_initial_preview(out_dir: Path) -> Optional[str]:
    """Build the HTML preview from page_001.md, or from the first *.md in out_dir."""
    try:
        md_path = out_dir / "pages" / "page_001.md"
        if not md_path.exists():
            # Fallback to a full markdown file if page-specific files don't exist
            md_path = next(out_dir.glob("*.md"), None)
        if md_path is None or not md_path.exists():
            return None

        md_content = md_path.read_text(encoding="utf-8", errors="ignore")
        # Convert markdown to HTML with embedded images
        return create_page_html_content(md_content.split('\n'), out_dir)
    except Exception as e:
        # Best effort: a broken preview must not fail the whole run.
        print(f"❌ Error loading initial content: {e}")
        return None


def _find_enhanced_pdf(out_dir: Path) -> Optional[str]:
    """Return the path of an '*enhanced*.pdf' in out_dir or its parent, if any."""
    found = next(out_dir.glob("*enhanced*.pdf"), None)
    if found is not None:
        enhanced_pdf_path = str(found)
        print(f"✅ Found enhanced PDF: {enhanced_pdf_path}")
        return enhanced_pdf_path

    found = next(out_dir.parent.glob("*enhanced*.pdf"), None)
    if found is not None:
        enhanced_pdf_path = str(found)
        print(f"✅ Found enhanced PDF in parent: {enhanced_pdf_path}")
        return enhanced_pdf_path

    print(f"⚠️ No enhanced PDF found in {out_dir} or parent directory")
    # Debug: list all files in the directory
    all_files = list(out_dir.glob("*"))
    print(f"📁 Files in output directory: {[f.name for f in all_files]}")
    return None


def run_enhanced_parse(
    pdf_file: str,
    use_image_restoration: bool,
    restoration_task: str,
    restoration_device: str,
    restoration_dpi: int,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
    ocr_lang: str,
    ocr_psm: int,
    ocr_oem: int,
    ocr_extra_config: str,
    box_separator: str,
) -> Tuple[str, Optional[str], List[str], str, Optional[str], Optional[str], str]:
    """
    Run enhanced PDF parsing with DocRes image restoration.

    The PDF is copied into a temporary working directory, parsed with
    EnhancedPDFParser, and the results are then collected from the "outputs"
    directory (relative to the current working directory). The temporary
    working directory is removed again once parsing has finished or failed.

    Args:
        pdf_file: Path to input PDF file
        use_image_restoration: Whether to apply DocRes image restoration
        restoration_task: DocRes restoration task
        restoration_device: Device for DocRes processing ("auto" is passed on as None)
        restoration_dpi: DPI for restoration processing
        use_vlm: Whether to use Vision Language Model
        vlm_provider: VLM provider name
        vlm_api_key: API key for VLM provider
        layout_model_name: Layout detection model name
        dpi: DPI for image processing
        min_score: Minimum confidence score for layout detection
        ocr_lang: OCR language code
        ocr_psm: Tesseract PSM mode
        ocr_oem: Tesseract OEM mode
        ocr_extra_config: Additional OCR configuration
        box_separator: Separator for bounding boxes (defaults to a newline when empty)

    Returns:
        Tuple of (status_message, markdown_preview, file_paths, zip_path,
        original_pdf_path, enhanced_pdf_path, output_dir). On any failure the
        status message describes the problem and the remaining fields are empty.
    """
    import shutil

    if not pdf_file:
        return ("No file provided.", None, [], "", None, None, "")

    # Validate VLM configuration if VLM is enabled
    if use_vlm:
        vlm_error = validate_vlm_config(use_vlm, vlm_api_key)
        if vlm_error:
            return (vlm_error, None, [], "", None, None, "")

    original_filename = Path(pdf_file).stem

    tmp_dir = None
    try:
        # Work on a copy named after the original document. The copy lives
        # inside the try block so that a failed copy is reported through the
        # status message like every other failure.
        tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_enhanced_"))
        input_pdf = tmp_dir / f"{original_filename}.pdf"
        shutil.copy2(pdf_file, input_pdf)

        # Initialize enhanced parser with configuration
        parser = EnhancedPDFParser(
            use_image_restoration=use_image_restoration,
            restoration_task=restoration_task,
            restoration_device=restoration_device if restoration_device != "auto" else None,
            restoration_dpi=int(restoration_dpi),
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_api_key=vlm_api_key or None,
            layout_model_name=layout_model_name,
            dpi=int(dpi),
            min_score=float(min_score),
            ocr_lang=ocr_lang,
            ocr_psm=int(ocr_psm),
            ocr_oem=int(ocr_oem),
            ocr_extra_config=ocr_extra_config or "",
            box_separator=box_separator or "\n",
        )

        # Parse the PDF with enhancement
        parser.parse(str(input_pdf))

    except Exception as e:
        traceback.print_exc()
        # Safely encode error message for return value
        try:
            error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
        except Exception:
            error_msg = "<Unicode encoding error>"
        return (f"❌ Enhanced parsing failed: {error_msg}", None, [], "", None, None, "")
    finally:
        # The working copy is only needed while parse() runs; everything read
        # below comes from the outputs directory, and the original upload path
        # (not the copy) is what gets returned to the UI.
        if tmp_dir is not None:
            shutil.rmtree(tmp_dir, ignore_errors=True)

    # Find output directory
    out_dir = _resolve_enhanced_output_dir(Path("outputs"), original_filename)

    # Load first page content initially (page-specific content)
    md_preview = _load_initial_preview(out_dir)

    # Gather output files and create ZIP
    _, file_paths, zip_path = gather_outputs(
        out_dir,
        zip_filename=f"{original_filename}_enhanced",
        is_structured_parsing=False
    )

    # An enhanced PDF is only looked for when restoration was requested
    enhanced_pdf_path = _find_enhanced_pdf(out_dir) if use_image_restoration else None

    return (
        f"✅ Enhanced parsing completed successfully!\n📁 Output directory: {out_dir}",
        md_preview,
        file_paths,
        zip_path,
        pdf_file,  # Original PDF path
        enhanced_pdf_path,  # Enhanced PDF path
        str(out_dir)  # Output directory for page-specific content
    )
196
+
197
+
198
def render_pdf_pages_for_comparison(pdf_path: str, max_pages: int = 10) -> Tuple[List[str], List[str]]:
    """
    Render PDF pages to images for before/after comparison.

    Each rendered page is written to its own temporary PNG file. The files are
    created with delete=False, so they stay on disk after this function returns
    and the caller (or the OS temp cleanup) is responsible for removing them.

    Args:
        pdf_path: Path to PDF file
        max_pages: Maximum number of pages to render (a falsy value disables the limit)

    Returns:
        Tuple of (image_paths, page_options); both lists are empty when the
        path is missing, does not exist, or rendering fails.
    """
    if not pdf_path or not Path(pdf_path).exists():
        return [], []

    try:
        # render_pdf_to_images returns (pil_image, width, height) tuples
        image_tuples = render_pdf_to_images(pdf_path)

        # Limit to max_pages if specified
        if max_pages and len(image_tuples) > max_pages:
            image_tuples = image_tuples[:max_pages]

        images = []
        page_options = []

        for page_number, (pil_image, _width, _height) in enumerate(image_tuples, start=1):
            # Reserve a unique temp path and close the handle right away:
            # leaving it open leaks one descriptor per page and prevents the
            # file from being re-opened by name on Windows.
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
                temp_path = temp_file.name
            pil_image.save(temp_path, 'PNG')
            images.append(temp_path)
            page_options.append(f"Page {page_number}")

        return images, page_options
    except Exception as e:
        # Best effort: the comparison view simply stays empty on failure.
        print(f"Error rendering PDF pages: {e}")
        return [], []
235
+
236
+
237
def update_enhanced_page_selector(original_pdf: str, enhanced_pdf: str) -> Tuple[gr.Dropdown, List[str], List[str], str, str, Optional[str], Optional[str]]:
    """
    Refresh the page selector and page images after PDFs have been loaded.

    Both PDFs are rendered through render_pdf_pages_for_comparison; a missing
    (falsy) path contributes empty lists. The dropdown offers the page labels
    of whichever PDF yielded more pages (the original wins a tie) and is hidden
    when there are no pages at all.

    Args:
        original_pdf: Path to original PDF file
        enhanced_pdf: Path to enhanced PDF file

    Returns:
        Tuple of (dropdown, original_pages, enhanced_pages, original_pdf_path,
        enhanced_pdf_path, first_original_image, first_enhanced_image)
    """
    original_pages, original_options = [], []
    if original_pdf:
        original_pages, original_options = render_pdf_pages_for_comparison(original_pdf)

    enhanced_pages, enhanced_options = [], []
    if enhanced_pdf:
        enhanced_pages, enhanced_options = render_pdf_pages_for_comparison(enhanced_pdf)

    # One selector drives both views, so it uses the longer label list.
    if len(enhanced_options) > len(original_options):
        common_options = enhanced_options
    else:
        common_options = original_options

    # Pre-select the first page when there is one.
    default_page = None
    if common_options:
        default_page = common_options[0]

    first_original_image = original_pages[0] if original_pages else None
    first_enhanced_image = enhanced_pages[0] if enhanced_pages else None

    selector = gr.Dropdown(
        choices=common_options,
        value=default_page,
        visible=bool(common_options),
    )
    return (
        selector,
        original_pages,
        enhanced_pages,
        original_pdf or "",
        enhanced_pdf or "",
        first_original_image,
        first_enhanced_image,
    )
269
+
270
+
271
def sync_enhanced_page_changes(
    page_selector: str,
    original_pages: List[str],
    enhanced_pages: List[str],
    original_pdf_path: str,
    enhanced_pdf_path: str,
    output_dir: str = None
) -> Tuple[Optional[str], Optional[str], str]:
    """
    Resolve the selected page to its original/enhanced images and its content.

    The selector label is expected to look like "Page N"; the second
    whitespace-separated token is parsed as the 1-based page number. Labels
    that are empty or cannot be parsed yield (None, None, "").

    Args:
        page_selector: Selected page identifier
        original_pages: List of original page image paths
        enhanced_pages: List of enhanced page image paths
        original_pdf_path: Path to original PDF (not read by this function)
        enhanced_pdf_path: Path to enhanced PDF (not read by this function)
        output_dir: Output directory for page-specific content

    Returns:
        Tuple of (original_page_image, enhanced_page_image, page_content_html);
        an image is None when the page lies outside that list, and the HTML is
        an empty string when no page content could be loaded.
    """
    if not page_selector:
        return None, None, ""

    try:
        page_num = int(page_selector.split()[1])  # "Page 1" -> 1
    except (ValueError, IndexError):
        return None, None, ""
    page_index = page_num - 1  # zero-based position in the image lists

    def _page_at(pages):
        # Bounds-checked lookup so a short list yields None instead of raising.
        if pages and 0 <= page_index < len(pages):
            return pages[page_index]
        return None

    original_page = _page_at(original_pages)
    enhanced_page = _page_at(enhanced_pages)

    if not output_dir:
        return original_page, enhanced_page, ""

    page_content_html = ""
    try:
        # Per-page markdown lives in "<output_dir>/pages/page_NNN.md"
        page_md_path = Path(output_dir) / "pages" / f"page_{page_num:03d}.md"
        if not page_md_path.exists():
            print(f"⚠️ Page {page_num} content file not found: {page_md_path}")
        else:
            with page_md_path.open("r", encoding="utf-8", errors="ignore") as f:
                md_content = f.read()

            # Convert markdown to HTML with embedded images
            page_content_html = create_page_html_content(md_content.split('\n'), Path(output_dir))
    except Exception as e:
        print(f"❌ Error loading page {page_num} content: {e}")

    return original_page, enhanced_page, page_content_html
333
+
334
+
335
def create_enhanced_parser_tab() -> Tuple[gr.Tab, dict]:
    """
    Build the "Enhanced Parser" tab and wire up its event handlers.

    The tab contains the input controls, the run button with a status box, a
    page selector, side-by-side original/enhanced page images, an HTML content
    preview and the download widgets. Clicking the run button calls
    run_enhanced_parse and then update_enhanced_page_selector; changing the
    page selector calls sync_enhanced_page_changes.

    Returns:
        Tuple of (tab_component, state_variables_dict), where the dict maps
        each component's variable name to the component itself.
    """
    restoration_tasks = ["appearance", "dewarping", "deshadowing", "deblurring", "binarization", "end2end"]
    restoration_devices = ["auto", "cuda", "cpu"]
    vlm_providers = ["gemini", "openai", "anthropic", "openrouter"]

    with gr.Tab("Enhanced Parser") as tab:
        # --- Inputs -------------------------------------------------------
        with gr.Row():
            pdf_enhanced = gr.File(file_types=[".pdf"], label="PDF")
            use_image_restoration = gr.Checkbox(label="Use Image Restoration", value=True)
            restoration_task = gr.Dropdown(restoration_tasks, value="appearance", label="Restoration Task")
            restoration_device = gr.Dropdown(restoration_devices, value="auto", label="Restoration Device")

        # --- VLM settings -------------------------------------------------
        with gr.Row():
            use_vlm_enhanced = gr.Checkbox(label="Use VLM (optional)", value=False)
            vlm_provider_enhanced = gr.Dropdown(vlm_providers, value="gemini", label="VLM Provider")
            vlm_api_key_enhanced = gr.Textbox(
                type="password",
                label="VLM API Key",
                placeholder="Optional if VLM disabled",
            )

        # --- Advanced settings (collapsed by default) ---------------------
        with gr.Accordion("Advanced Settings", open=False):
            with gr.Row():
                restoration_dpi = gr.Slider(100, 400, value=200, step=10, label="Restoration DPI")
                layout_model_enhanced = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
                dpi_enhanced = gr.Slider(100, 400, value=200, step=10, label="Processing DPI")
                min_score_enhanced = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")

            with gr.Row():
                ocr_lang_enhanced = gr.Textbox(value="eng", label="OCR Language")
                ocr_psm_enhanced = gr.Slider(0, 13, value=4, step=1, label="Tesseract PSM")
                ocr_oem_enhanced = gr.Slider(0, 3, value=3, step=1, label="Tesseract OEM")

            with gr.Row():
                ocr_config_enhanced = gr.Textbox(value="", label="Extra OCR config")
                box_sep_enhanced = gr.Textbox(value="\n", label="Box separator")

        # --- Run button and status ---------------------------------------
        run_enhanced_btn = gr.Button("▶ Run Enhanced Parse", variant="primary")
        enhanced_status = gr.Textbox(label="Status", elem_classes=["status-ok"])

        # --- Page selector (hidden until pages are available) -------------
        with gr.Row():
            enhanced_page_selector = gr.Dropdown(
                label="Select Page for Comparison",
                interactive=True,
                visible=False,
            )

        # --- Before/after comparison -------------------------------------
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 📄 Original PDF")
                enhanced_original_pdf = gr.File(label="Original PDF File", interactive=False, visible=False)
                enhanced_original_page_image = gr.Image(label="Original PDF Page", interactive=False, height=600)
            with gr.Column():
                gr.Markdown("### ✨ Enhanced PDF")
                enhanced_enhanced_pdf = gr.File(label="Enhanced PDF File", interactive=False, visible=False)
                enhanced_enhanced_page_image = gr.Image(label="Enhanced PDF Page", interactive=False, height=600)

        # --- Extracted content --------------------------------------------
        with gr.Row():
            enhanced_md_preview = gr.HTML(label="Extracted Content", visible=True, elem_classes=["page-content"])

        # --- Downloads ----------------------------------------------------
        enhanced_files_out = gr.Files(label="Download individual output files")
        enhanced_zip_out = gr.File(label="Download all outputs (ZIP)")

        # --- Per-session state shared between the handlers ----------------
        enhanced_original_pages_state = gr.State([])
        enhanced_enhanced_pages_state = gr.State([])
        enhanced_original_pdf_path_state = gr.State("")
        enhanced_enhanced_pdf_path_state = gr.State("")
        enhanced_output_dir_state = gr.State("")

        # --- Event wiring -------------------------------------------------
        # Order matters: these lists are matched positionally against the
        # parameters / return tuples of the handler functions.
        parse_inputs = [
            pdf_enhanced,
            use_image_restoration,
            restoration_task,
            restoration_device,
            restoration_dpi,
            use_vlm_enhanced,
            vlm_provider_enhanced,
            vlm_api_key_enhanced,
            layout_model_enhanced,
            dpi_enhanced,
            min_score_enhanced,
            ocr_lang_enhanced,
            ocr_psm_enhanced,
            ocr_oem_enhanced,
            ocr_config_enhanced,
            box_sep_enhanced,
        ]
        parse_outputs = [
            enhanced_status,
            enhanced_md_preview,
            enhanced_files_out,
            enhanced_zip_out,
            enhanced_original_pdf,
            enhanced_enhanced_pdf,
            enhanced_output_dir_state,
        ]
        selector_outputs = [
            enhanced_page_selector,
            enhanced_original_pages_state,
            enhanced_enhanced_pages_state,
            enhanced_original_pdf_path_state,
            enhanced_enhanced_pdf_path_state,
            enhanced_original_page_image,
            enhanced_enhanced_page_image,
        ]
        page_change_inputs = [
            enhanced_page_selector,
            enhanced_original_pages_state,
            enhanced_enhanced_pages_state,
            enhanced_original_pdf_path_state,
            enhanced_enhanced_pdf_path_state,
            enhanced_output_dir_state,
        ]
        page_change_outputs = [
            enhanced_original_page_image,
            enhanced_enhanced_page_image,
            enhanced_md_preview,
        ]

        # Parse first, then refresh the selector from the resulting PDFs.
        parse_event = run_enhanced_btn.click(
            fn=run_enhanced_parse,
            inputs=parse_inputs,
            outputs=parse_outputs,
        )
        parse_event.then(
            fn=update_enhanced_page_selector,
            inputs=[enhanced_original_pdf, enhanced_enhanced_pdf],
            outputs=selector_outputs,
        )

        # Handle page selector changes
        enhanced_page_selector.change(
            fn=sync_enhanced_page_changes,
            inputs=page_change_inputs,
            outputs=page_change_outputs,
        )

    # Expose every component under its variable name for external access
    state_vars = dict(
        pdf_enhanced=pdf_enhanced,
        use_image_restoration=use_image_restoration,
        restoration_task=restoration_task,
        restoration_device=restoration_device,
        restoration_dpi=restoration_dpi,
        use_vlm_enhanced=use_vlm_enhanced,
        vlm_provider_enhanced=vlm_provider_enhanced,
        vlm_api_key_enhanced=vlm_api_key_enhanced,
        layout_model_enhanced=layout_model_enhanced,
        dpi_enhanced=dpi_enhanced,
        min_score_enhanced=min_score_enhanced,
        ocr_lang_enhanced=ocr_lang_enhanced,
        ocr_psm_enhanced=ocr_psm_enhanced,
        ocr_oem_enhanced=ocr_oem_enhanced,
        ocr_config_enhanced=ocr_config_enhanced,
        box_sep_enhanced=box_sep_enhanced,
        run_enhanced_btn=run_enhanced_btn,
        enhanced_status=enhanced_status,
        enhanced_page_selector=enhanced_page_selector,
        enhanced_original_pdf=enhanced_original_pdf,
        enhanced_original_page_image=enhanced_original_page_image,
        enhanced_enhanced_pdf=enhanced_enhanced_pdf,
        enhanced_enhanced_page_image=enhanced_enhanced_page_image,
        enhanced_md_preview=enhanced_md_preview,
        enhanced_files_out=enhanced_files_out,
        enhanced_zip_out=enhanced_zip_out,
        enhanced_original_pages_state=enhanced_original_pages_state,
        enhanced_enhanced_pages_state=enhanced_enhanced_pages_state,
        enhanced_original_pdf_path_state=enhanced_original_pdf_path_state,
        enhanced_enhanced_pdf_path_state=enhanced_enhanced_pdf_path_state,
        enhanced_output_dir_state=enhanced_output_dir_state,
    )

    return tab, state_vars