doctra 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,539 @@
1
+ """
2
+ Full Parse UI Module
3
+
4
+ This module contains all functionality for the Full Parse tab in the Doctra Gradio interface.
5
+ It handles PDF parsing, markdown rendering, page navigation, and image display.
6
+ """
7
+
8
+ import tempfile
9
+ import traceback
10
+ from pathlib import Path
11
+ from typing import Tuple, List, Optional
12
+
13
+ import gradio as gr
14
+
15
+ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
16
+ from doctra.utils.pdf_io import render_pdf_to_images
17
+ from doctra.ui.ui_helpers import (
18
+ gather_outputs,
19
+ parse_markdown_by_pages,
20
+ validate_vlm_config,
21
+ create_page_html_content
22
+ )
23
+
24
+
25
def run_full_parse(
    pdf_file: str,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
    ocr_lang: str,
    ocr_psm: int,
    ocr_oem: int,
    ocr_extra_config: str,
    box_separator: str,
) -> Tuple[str, Optional[str], List[tuple[str, str]], List[str], str]:
    """
    Run full PDF parsing with structured output.

    The PDF is copied into a scratch directory, parsed with
    ``StructuredPDFParser``, and the results are then collected from
    ``outputs/{pdf stem}/full_parse``. The scratch directory is removed once
    parsing has finished, whether it succeeded or not.

    Args:
        pdf_file: Path to input PDF file
        use_vlm: Whether to use Vision Language Model
        vlm_provider: VLM provider name
        vlm_api_key: API key for VLM provider
        layout_model_name: Layout detection model name
        dpi: DPI for image processing
        min_score: Minimum confidence score for layout detection
        ocr_lang: OCR language code
        ocr_psm: Tesseract PSM mode
        ocr_oem: Tesseract OEM mode
        ocr_extra_config: Additional OCR configuration
        box_separator: Separator for bounding boxes

    Returns:
        Tuple of (status_message, markdown_preview, gallery_items, file_paths, zip_path).
        When no file is given, the VLM configuration is rejected, or parsing
        raises, the tuple is (message, None, [], [], "").
    """
    # Function-scope import: shutil is not part of the module's top-level imports.
    import shutil

    if not pdf_file:
        return ("No file provided.", None, [], [], "")

    # Validate VLM configuration
    vlm_error = validate_vlm_config(use_vlm, vlm_api_key)
    if vlm_error:
        return (vlm_error, None, [], [], "")

    original_filename = Path(pdf_file).stem

    # Work on a copy inside a scratch directory; the finally-clause below makes
    # sure the directory does not accumulate across runs.
    tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
    try:
        input_pdf = tmp_dir / f"{original_filename}.pdf"
        shutil.copy2(pdf_file, input_pdf)

        # Initialize parser with configuration
        parser = StructuredPDFParser(
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_api_key=vlm_api_key or None,
            layout_model_name=layout_model_name,
            dpi=int(dpi),
            min_score=float(min_score),
            ocr_lang=ocr_lang,
            ocr_psm=int(ocr_psm),
            ocr_oem=int(ocr_oem),
            ocr_extra_config=ocr_extra_config or "",
            box_separator=box_separator or "\n",
        )

        try:
            parser.parse(str(input_pdf))
        except Exception as e:
            traceback.print_exc()
            # Only blame the VLM when it was actually enabled.
            failure = "VLM processing failed" if use_vlm else "Parsing failed"
            # Safely encode error message for return value
            try:
                error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
            except Exception:
                error_msg = "<Unicode encoding error>"
            return (f"❌ {failure}: {error_msg}", None, [], [], "")
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # Find output directory
    outputs_root = Path("outputs")
    out_dir = outputs_root / original_filename / "full_parse"
    if not out_dir.exists():
        # Fallback: most recently modified directory under outputs. Plain files
        # are filtered out explicitly so they can never be picked as a parent.
        candidates = sorted(
            (p for p in outputs_root.glob("*") if p.is_dir()),
            key=lambda p: p.stat().st_mtime,
            reverse=True,
        )
        out_dir = candidates[0] / "full_parse" if candidates else outputs_root

    # Read the first markdown file found in the output directory, if any
    md_preview = None
    md_file = next(out_dir.glob("*.md"), None)
    if md_file is not None:
        try:
            md_preview = md_file.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Best effort: a missing/unreadable file just means no preview.
            md_preview = None

    # Gather output files and create ZIP
    gallery_items, file_paths, zip_path = gather_outputs(
        out_dir,
        zip_filename=original_filename,
        is_structured_parsing=False
    )

    return (
        f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}",
        md_preview,
        gallery_items,
        file_paths,
        zip_path
    )
136
+
137
+
138
def parse_markdown_by_pages_simple(md_content: str) -> List[dict]:
    """
    Parse markdown content and organize it by pages (simplified version).

    A page starts at every line that, once stripped, begins with ``## Page ``
    and runs up to (but not including) the next such line or the end of the
    document. Lines before the first page header belong to no page and are
    dropped.

    Args:
        md_content: Raw markdown content string (empty or None yields no pages)

    Returns:
        List of page dictionaries with keys ``page_num`` (the header text after
        ``## Page ``, stripped, as a string) and ``content`` (the page's lines,
        header line included)
    """
    if not md_content:
        return []

    header_prefix = '## Page '
    lines = md_content.split('\n')

    # (line index, page number) of every page header, in document order.
    # Only the leading marker is removed, so header text that happens to repeat
    # the marker further along is kept intact.
    headers = []
    for idx, line in enumerate(lines):
        stripped = line.strip()
        if stripped.startswith(header_prefix):
            headers.append((idx, stripped[len(header_prefix):].strip()))

    # Page k ends where page k+1 starts; the last page ends with the document.
    ends = [idx for idx, _ in headers[1:]] + [len(lines)]

    return [
        {'page_num': page_num, 'content': lines[start:end]}
        for (start, page_num), end in zip(headers, ends)
    ]
180
+
181
+
182
def update_page_selector(pages_data: List[dict]) -> gr.Dropdown:
    """
    Build the page selector dropdown for the parsed pages.

    Args:
        pages_data: List of page data dictionaries (each read via its
            ``page_num`` key)

    Returns:
        A visible dropdown listing ``Page <page_num>`` for every page with the
        first entry selected, or a hidden, empty dropdown when there are no
        pages
    """
    if pages_data:
        labels = []
        for entry in pages_data:
            labels.append(f"Page {entry['page_num']}")
        return gr.Dropdown(choices=labels, value=labels[0], visible=True)

    # Nothing parsed: keep the selector out of sight.
    return gr.Dropdown(choices=[], value=None, visible=False)
197
+
198
+
199
def display_selected_page(
    selected_page: str,
    pages_data: List[dict],
    pdf_path: str,
    page_images: List[str]
) -> Tuple[str, Optional[str]]:
    """
    Display the content of the selected page and the rendered page image.

    Args:
        selected_page: Selected page identifier, e.g. ``"Page 3"``
        pages_data: List of page data dictionaries (``page_num`` / ``content``)
        pdf_path: Path to the original PDF file
        page_images: List of page image file paths, one per page in page order.
            When empty and ``pdf_path`` is set, the pages are rendered to JPEG
            files in a fresh temporary directory for this call.

    Returns:
        Tuple of (html_content, page_image_path). ``("", None)`` when nothing
        is selected or there are no pages, ``("Page not found", None)`` when
        the selection matches no page. ``page_image_path`` is None when the
        page number is not an integer within the range of available images.
    """
    if not selected_page or not pages_data:
        return "", None

    # Strip only the leading label so the lookup is the exact inverse of the
    # "Page <num>" choices offered by the selector.
    label_prefix = "Page "
    if selected_page.startswith(label_prefix):
        page_num = selected_page[len(label_prefix):]
    else:
        page_num = selected_page

    page = next((p for p in pages_data if p['page_num'] == page_num), None)
    if page is None:
        return "Page not found", None

    # Directory passed to the HTML builder alongside the page content
    base_dir = None
    try:
        stem = Path(pdf_path).stem if pdf_path else ""
        if stem:
            base_dir = Path("outputs") / stem / "full_parse"
    except Exception:
        base_dir = None

    content = create_page_html_content(page['content'], base_dir)

    # Ensure page images are prepared. Rendering is best effort: a failure
    # leaves the page without an image but is reported on stderr rather than
    # silently discarded.
    try:
        if pdf_path and not page_images:
            tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
            saved_paths: List[str] = []
            for idx, (im, _, _) in enumerate(render_pdf_to_images(pdf_path), start=1):
                out_path = tmp_img_dir / f"page_{idx:03d}.jpg"
                im.save(out_path, format="JPEG", quality=90)
                saved_paths.append(str(out_path))
            page_images = saved_paths
    except Exception:
        traceback.print_exc()

    # Select image for the current page number (1-based)
    page_img = None
    try:
        page_index = int(page_num)
    except (TypeError, ValueError):
        # Non-numeric page labels simply have no associated image.
        page_index = None
    if page_index is not None and page_images and 1 <= page_index <= len(page_images):
        page_img = page_images[page_index - 1]

    return content, page_img
262
+
263
+
264
def filter_gallery_by_image(img_path: str, caption: str, all_images: List[tuple]) -> List[tuple]:
    """
    Filter gallery to show only the selected image.

    An entry whose path and caption both match is preferred. If no entry has
    that exact path, the first entry with a matching caption is used, so
    images that share a caption can still be told apart whenever the path is
    known.

    Args:
        img_path: Path to the selected image
        caption: Caption of the selected image
        all_images: List of all available images as (path, caption) pairs

    Returns:
        ``all_images`` unchanged when ``img_path`` or ``all_images`` is empty;
        otherwise a list holding the single matching (path, caption) pair, or
        an empty list when no caption matches
    """
    if not img_path or not all_images:
        return all_images

    caption_only_match = None
    for stored_img_path, stored_caption in all_images:
        if stored_caption != caption:
            continue
        if stored_img_path == img_path:
            # Exact hit: nothing later in the list can be a better match.
            return [(stored_img_path, stored_caption)]
        if caption_only_match is None:
            caption_only_match = (stored_img_path, stored_caption)

    return [caption_only_match] if caption_only_match is not None else []
287
+
288
+
289
def trigger_image_filter(filter_input: str) -> Tuple[str, str]:
    """
    Split the hidden filter textbox value into its two components.

    Args:
        filter_input: Input string in format "img_path|caption"; only the
            first ``|`` separates the fields, so the caption may itself
            contain ``|``

    Returns:
        Tuple of (img_path, caption), or ("", "") when the input is empty or
        contains no ``|``
    """
    nothing = ("", "")
    if not filter_input:
        return nothing

    path_part, separator, caption_part = filter_input.partition("|")
    if not separator:
        # No delimiter at all: the input is not in the expected format.
        return nothing
    return path_part, caption_part
308
+
309
+
310
def filter_gallery_by_trigger(
    img_path: str,
    caption: str,
    all_images: List[tuple]
) -> List[tuple]:
    """
    Filter gallery based on trigger values.

    An entry whose path and caption both match is preferred. If no entry has
    that exact path, the first entry with a matching caption is used, so
    images that share a caption can still be told apart whenever the path is
    known.

    Args:
        img_path: Path to the selected image
        caption: Caption of the selected image
        all_images: List of all available images as (path, caption) pairs

    Returns:
        ``all_images`` unchanged when any of the three arguments is empty;
        otherwise a list holding the single matching (path, caption) pair, or
        an empty list when no caption matches
    """
    if not img_path or not caption or not all_images:
        return all_images

    caption_only_match = None
    for stored_img_path, stored_caption in all_images:
        if stored_caption != caption:
            continue
        if stored_img_path == img_path:
            # Exact hit: nothing later in the list can be a better match.
            return [(stored_img_path, stored_caption)]
        if caption_only_match is None:
            caption_only_match = (stored_img_path, stored_caption)

    return [caption_only_match] if caption_only_match is not None else []
337
+
338
+
339
def run_full_parse_with_pages(*args) -> Tuple[str, str, Optional[str], List[tuple], List[str], str, List[dict], List[tuple], str, List[str]]:
    """
    Run full parse and parse the markdown into pages with enhanced functionality.

    Besides forwarding to ``run_full_parse``, this splits the markdown into
    pages, collects every ``![caption](images/...)`` reference found on them,
    renders each PDF page to a JPEG in a temporary directory, and builds the
    HTML for the first page.

    Args:
        *args: All input arguments for run_full_parse; the first one is the
            input PDF path

    Returns:
        Tuple of (status_msg, first_page_content, first_page_image, gallery_items, file_paths, zip_path, pages_data, all_images, input_pdf_path, saved_paths)
    """
    # Function-scope import: re is not part of the module's top-level imports.
    import re

    status_msg, md_content, gallery_items, file_paths, zip_path = run_full_parse(*args)

    # Parse markdown into pages
    pages_data: List[dict] = []
    all_images: List[tuple] = []

    if md_content:
        pages_data = parse_markdown_by_pages_simple(md_content)

        # Collect all images from all pages. The pattern is compiled once
        # instead of being looked up for every matching line.
        image_ref = re.compile(r'!\[([^\]]+)\]\(([^)]+)\)')
        for page in pages_data:
            for line in page['content']:
                stripped = line.strip()
                if not stripped.startswith('!['):
                    continue
                if '](images/' not in line and '](images\\' not in line:
                    continue
                match = image_ref.match(stripped)
                if match:
                    # Stored as (img_path, caption)
                    all_images.append((match.group(2), match.group(1)))

    # Prepare first page image immediately and cache page images
    input_pdf_path = args[0]
    first_page_image = None
    saved_paths: List[str] = []

    # Rendering is best effort: on failure the UI simply gets no page images,
    # but the error is reported on stderr rather than silently discarded.
    try:
        if input_pdf_path:
            tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
            for idx, (im, _, _) in enumerate(render_pdf_to_images(input_pdf_path), start=1):
                out_path = tmp_img_dir / f"page_{idx:03d}.jpg"
                im.save(out_path, format="JPEG", quality=90)
                saved_paths.append(str(out_path))
            if saved_paths:
                first_page_image = saved_paths[0]
    except Exception:
        traceback.print_exc()

    # Build initial HTML with inline images and proper blocks for first page
    first_page_content = ""
    if pages_data:
        base_dir = None
        try:
            stem = Path(input_pdf_path).stem if input_pdf_path else ""
            if stem:
                base_dir = Path("outputs") / stem / "full_parse"
        except Exception:
            base_dir = None

        first_page_content = create_page_html_content(pages_data[0]['content'], base_dir)

    return (
        status_msg,
        first_page_content,
        first_page_image,
        gallery_items,
        file_paths,
        zip_path,
        pages_data,
        all_images,
        input_pdf_path,
        saved_paths
    )
418
+
419
+
420
def create_full_parse_tab() -> Tuple[gr.Tab, dict]:
    """
    Build the "Full Parse" tab: inputs, result widgets, state and event wiring.

    Components are created in layout order, so the statement order below is
    what defines the arrangement of the tab.

    Returns:
        Tuple of (tab_component, state_variables_dict), where the dict maps a
        short name to every component and state object created here
    """
    with gr.Tab("Full Parse") as tab:
        # --- Primary inputs -------------------------------------------------
        with gr.Row():
            pdf = gr.File(file_types=[".pdf"], label="PDF")
            use_vlm = gr.Checkbox(label="Use VLM (optional)", value=False)
            vlm_provider = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
            vlm_api_key = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")

        # --- Advanced settings (collapsed by default) -----------------------
        with gr.Accordion("Advanced", open=False):
            with gr.Row():
                layout_model = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
                dpi = gr.Slider(100, 400, value=200, step=10, label="DPI")
                min_score = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")
            with gr.Row():
                ocr_lang = gr.Textbox(value="eng", label="OCR Language")
                ocr_psm = gr.Slider(0, 13, value=4, step=1, label="Tesseract PSM")
                ocr_oem = gr.Slider(0, 3, value=3, step=1, label="Tesseract OEM")
            with gr.Row():
                ocr_config = gr.Textbox(value="", label="Extra OCR config")
                box_sep = gr.Textbox(value="\n", label="Box separator")

        # --- Run button and status line -------------------------------------
        run_btn = gr.Button("▶ Run Full Parse", variant="primary")
        status = gr.Textbox(label="Status", elem_classes=["status-ok"])

        # --- Page picker (created hidden) ------------------------------------
        page_selector = gr.Dropdown(label="Select Page to Display", interactive=True, visible=False)

        # --- Side-by-side page content and page image -----------------------
        with gr.Row():
            with gr.Column():
                md_preview = gr.HTML(label="Extracted Content", visible=True, elem_classes=["page-content"])
            with gr.Column():
                page_image = gr.Image(label="Page image", interactive=False)

        # --- Extracted images and downloads ---------------------------------
        gallery = gr.Gallery(label="Extracted images (tables/charts/figures)", columns=4, height=420, preview=True)
        files_out = gr.Files(label="Download individual output files")
        zip_out = gr.File(label="Download all outputs (ZIP)")

        # --- Per-session state ----------------------------------------------
        pages_state = gr.State([])
        all_images_state = gr.State([])
        pdf_path_state = gr.State("")
        page_images_state = gr.State([])  # rendered page image paths, in page order

        # --- Hidden plumbing for gallery filtering --------------------------
        filter_trigger = gr.Button(visible=False)
        current_image_path = gr.State("")
        current_image_caption = gr.State("")
        image_filter_input = gr.Textbox(visible=False, elem_id="image_filter_input")

        # --- Event wiring ---------------------------------------------------
        parse_inputs = [
            pdf, use_vlm, vlm_provider, vlm_api_key,
            layout_model, dpi, min_score,
            ocr_lang, ocr_psm, ocr_oem, ocr_config, box_sep,
        ]
        parse_outputs = [
            status, md_preview, page_image, gallery, files_out, zip_out,
            pages_state, all_images_state, pdf_path_state, page_images_state,
        ]

        # Parse first, then refresh the page picker from the stored pages.
        parse_event = run_btn.click(
            fn=run_full_parse_with_pages,
            inputs=parse_inputs,
            outputs=parse_outputs,
        )
        parse_event.then(
            fn=update_page_selector,
            inputs=[pages_state],
            outputs=[page_selector],
        )

        # Picking a page swaps both the content pane and the page image.
        page_selector.change(
            fn=display_selected_page,
            inputs=[page_selector, pages_state, pdf_path_state, page_images_state],
            outputs=[md_preview, page_image],
        )

        # A change of the hidden textbox is split into path/caption, which then
        # drive the gallery filter.
        filter_event = image_filter_input.change(
            fn=trigger_image_filter,
            inputs=[image_filter_input],
            outputs=[current_image_path, current_image_caption],
        )
        filter_event.then(
            fn=filter_gallery_by_trigger,
            inputs=[current_image_path, current_image_caption, all_images_state],
            outputs=[gallery],
        )

        # Everything created above, exposed by name for external access
        state_vars = dict(
            pdf=pdf,
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_api_key=vlm_api_key,
            layout_model=layout_model,
            dpi=dpi,
            min_score=min_score,
            ocr_lang=ocr_lang,
            ocr_psm=ocr_psm,
            ocr_oem=ocr_oem,
            ocr_config=ocr_config,
            box_sep=box_sep,
            run_btn=run_btn,
            status=status,
            page_selector=page_selector,
            md_preview=md_preview,
            page_image=page_image,
            gallery=gallery,
            files_out=files_out,
            zip_out=zip_out,
            pages_state=pages_state,
            all_images_state=all_images_state,
            pdf_path_state=pdf_path_state,
            page_images_state=page_images_state,
            filter_trigger=filter_trigger,
            current_image_path=current_image_path,
            current_image_caption=current_image_caption,
            image_filter_input=image_filter_input,
        )

    return tab, state_vars