doctra 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
doctra/ui/app.py ADDED
@@ -0,0 +1,1012 @@
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Optional, Tuple, List, Dict, Any
7
+
8
+ import gradio as gr
9
+
10
+ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
11
+ from doctra.parsers.table_chart_extractor import ChartTablePDFParser
12
+
13
+
14
def _gather_outputs(
    out_dir: Path,
    allowed_kinds: Optional[List[str]] = None,
    zip_filename: Optional[str] = None,
    is_structured_parsing: bool = False,
) -> Tuple[List[tuple[str, str]], List[str], str]:
    """Collect gallery images, downloadable files and a ZIP archive for a run.

    Args:
        out_dir: Directory the parser wrote its results to. It may be missing;
            in that case the lists are empty and the archive has no entries.
        allowed_kinds: Sub-directory names (e.g. ``"tables"``, ``"charts"``)
            to restrict image collection to. When falsy, the gallery uses
            ``tables``, ``charts`` and ``figures``.
        zip_filename: Base name (without extension) for the archive.
            Characters in ``<>:"/\\|?*`` are replaced with ``_``. Defaults to
            ``doctra_outputs``.
        is_structured_parsing: When true, every file below ``out_dir`` is
            listed for download instead of the curated selection.

    Returns:
        ``(gallery_items, file_paths, zip_path)`` where ``gallery_items`` is a
        list of ``(image_path, caption)`` pairs, ``file_paths`` is a list of
        file path strings and ``zip_path`` is the path of the created archive.
    """

    def _matches(directory: Path, pattern: str) -> List[Path]:
        # Sorted so listings are stable between runs; a missing directory
        # simply contributes nothing.
        return sorted(directory.glob(pattern)) if directory.is_dir() else []

    gallery_items: List[tuple[str, str]] = []
    file_paths: List[str] = []
    has_outputs = out_dir.is_dir()

    if has_outputs:
        if is_structured_parsing:
            # Structured parsing: expose every file in the directory tree.
            file_paths.extend(
                str(p) for p in sorted(out_dir.rglob("*")) if p.is_file()
            )
        else:
            # Main documents are always offered, regardless of allowed_kinds.
            for name in ("result.html", "result.md", "tables.html", "tables.xlsx"):
                candidate = out_dir / name
                if candidate.exists():
                    file_paths.append(str(candidate))

            if allowed_kinds:
                for kind in allowed_kinds:
                    # ChartTablePDFParser writes <kind>/*.png directly ...
                    file_paths.extend(
                        str(p) for p in _matches(out_dir / kind, "*.png")
                    )
                    # ... StructuredPDFParser writes images/<kind>/*.jpg.
                    file_paths.extend(
                        str(p) for p in _matches(out_dir / "images" / kind, "*.jpg")
                    )
            else:
                file_paths.extend(str(p) for p in _matches(out_dir / "charts", "*.png"))
                file_paths.extend(str(p) for p in _matches(out_dir / "tables", "*.png"))
                file_paths.extend(
                    str(p) for p in _matches(out_dir / "images", "**/*.jpg")
                )

        # Excel workbook matching the extraction target. Skipped when it is
        # already listed so the download list never contains duplicates.
        if allowed_kinds:
            excel_name: Optional[str] = None
            if "charts" in allowed_kinds and "tables" in allowed_kinds:
                excel_name = "parsed_tables_charts.xlsx"
            elif "charts" in allowed_kinds:
                excel_name = "parsed_charts.xlsx"
            elif "tables" in allowed_kinds:
                excel_name = "parsed_tables.xlsx"

            if excel_name:
                excel_path = out_dir / excel_name
                if excel_path.exists() and str(excel_path) not in file_paths:
                    file_paths.append(str(excel_path))

        kinds = allowed_kinds if allowed_kinds else ["tables", "charts", "figures"]
        for sub in kinds:
            for img in _matches(out_dir / sub, "*.png"):
                gallery_items.append((str(img), f"{sub}: {img.name}"))
            for img in _matches(out_dir / "images" / sub, "*.jpg"):
                gallery_items.append((str(img), f"{sub}: {img.name}"))

    tmp_zip_dir = Path(tempfile.mkdtemp(prefix="doctra_zip_"))

    if zip_filename:
        # Replace characters that are not safe in file names.
        safe_filename = re.sub(r'[<>:"/\\|?*]', '_', zip_filename)
        zip_base = tmp_zip_dir / safe_filename
    else:
        zip_base = tmp_zip_dir / "doctra_outputs"

    # Stage a copy without editor lock files / temp files, then archive it.
    filtered_dir = tmp_zip_dir / "filtered_outputs"
    if has_outputs:
        shutil.copytree(
            out_dir,
            filtered_dir,
            ignore=shutil.ignore_patterns('~$*', '*.tmp', '*.temp'),
        )
    else:
        # copytree would raise on a missing source; archive an empty tree
        # instead so callers still receive a valid ZIP path.
        filtered_dir.mkdir()

    try:
        zip_path = shutil.make_archive(str(zip_base), 'zip', root_dir=str(filtered_dir))
    finally:
        # The staging copy is only needed while the archive is being built.
        shutil.rmtree(filtered_dir, ignore_errors=True)

    return gallery_items, file_paths, zip_path
110
+
111
+
112
def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
    """Split markdown into pages delimited by ``## Page <n>`` headers.

    Each returned page is a dict with the keys ``page_num`` (text after the
    header prefix), ``content`` (non-empty text lines), ``tables``, ``charts``,
    ``figures`` (image entries whose caption contains ``Table`` / ``Chart`` /
    ``Figure``), ``images`` (every image entry) and ``full_content`` (text
    lines and image markdown in document order). Image entries are dicts with
    ``caption`` and ``path``.

    Lines that appear before the first page header belong to no page and are
    ignored. An image-looking line that cannot be parsed is kept as ordinary
    text instead of being dropped.
    """
    page_prefix = '## Page '
    # Caption may be empty; the path may not.
    image_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')

    pages: List[Dict[str, Any]] = []
    current_page: Optional[Dict[str, Any]] = None

    for raw_line in md_content.split('\n'):
        line = raw_line.strip()

        if line.startswith(page_prefix):
            if current_page is not None:
                pages.append(current_page)
            current_page = {
                'page_num': line.replace(page_prefix, '').strip(),
                'content': [],
                'tables': [],
                'charts': [],
                'figures': [],
                'images': [],
                'full_content': [],  # text and inline images, in order
            }
            continue

        if current_page is None:
            # Nothing to attach this line to yet (this used to crash for
            # image lines that preceded the first page header).
            continue

        if line.startswith('![') and '](images/' in line:
            match = image_pattern.match(line)
            if match:
                caption, img_path = match.group(1), match.group(2)
                entry = {'caption': caption, 'path': img_path}

                # Categorise by the keyword found in the caption.
                if 'Table' in caption:
                    current_page['tables'].append(dict(entry))
                elif 'Chart' in caption:
                    current_page['charts'].append(dict(entry))
                elif 'Figure' in caption:
                    current_page['figures'].append(dict(entry))

                current_page['images'].append(entry)
                current_page['full_content'].append(f"![{caption}]({img_path})")
                continue

        if line:  # only non-empty lines are recorded
            current_page['content'].append(line)
            current_page['full_content'].append(line)

    if current_page is not None:
        pages.append(current_page)

    return pages
182
+
183
+
184
def run_full_parse(
    pdf_file: str,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
    ocr_lang: str,
    ocr_psm: int,
    ocr_oem: int,
    ocr_extra_config: str,
    box_separator: str,
) -> Tuple[str, Optional[str], List[tuple[str, str]], List[str], str]:
    """Run ``StructuredPDFParser`` on an uploaded PDF and collect its outputs.

    The arguments after ``pdf_file`` are forwarded to ``StructuredPDFParser``
    (numeric values are coerced with ``int`` / ``float``; an empty
    ``box_separator`` falls back to a newline).

    Returns:
        ``(status, markdown, gallery_items, file_paths, zip_path)``. On any
        failure ``status`` starts with an error marker, ``markdown`` is
        ``None``, the lists are empty and ``zip_path`` is ``""``.
    """
    failure: Tuple[Optional[str], List[tuple[str, str]], List[str], str] = (None, [], [], "")

    if not pdf_file:
        return ("No file provided.", *failure)

    # Validate the VLM configuration before touching the filesystem so a bad
    # key neither copies the PDF nor leaves a temp directory behind.
    if use_vlm:
        if not vlm_api_key:
            return ("❌ Error: VLM API key is required when using VLM", *failure)
        key = vlm_api_key.strip()
        if len(key) < 10:
            return ("❌ Error: VLM API key appears to be too short or invalid", *failure)
        if key.startswith('sk-') and len(key) < 20:
            return ("❌ Error: OpenAI API key appears to be invalid (too short)", *failure)

    original_filename = Path(pdf_file).stem

    # The PDF is copied under its own stem; the output directory is looked up
    # under that same stem below.
    tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
    try:
        input_pdf = tmp_dir / f"{original_filename}.pdf"
        shutil.copy2(pdf_file, input_pdf)

        parser = StructuredPDFParser(
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_api_key=vlm_api_key or None,
            layout_model_name=layout_model_name,
            dpi=int(dpi),
            min_score=float(min_score),
            ocr_lang=ocr_lang,
            ocr_psm=int(ocr_psm),
            ocr_oem=int(ocr_oem),
            ocr_extra_config=ocr_extra_config or "",
            box_separator=box_separator or "\n",
        )

        try:
            parser.parse(str(input_pdf))
        except Exception as e:
            import traceback
            traceback.print_exc()
            # Replace anything that cannot be encoded so the status text is
            # always displayable.
            try:
                error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
            except Exception:
                error_msg = "<Unicode encoding error>"
            # Only blame the VLM when it was actually enabled.
            label = "VLM processing failed" if use_vlm else "Parsing failed"
            return (f"❌ {label}: {error_msg}", *failure)
    finally:
        # The temporary copy of the input is not needed once parsing is over.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    outputs_root = Path("outputs")
    out_dir = outputs_root / original_filename / "full_parse"
    if not out_dir.exists():
        # Fallback: most recently modified directory under outputs/.
        candidates: List[Path] = []
        if outputs_root.is_dir():
            candidates = sorted(
                (p for p in outputs_root.glob("*") if p.is_dir()),
                key=lambda p: p.stat().st_mtime,
                reverse=True,
            )
        out_dir = candidates[0] / "full_parse" if candidates else outputs_root

    if not out_dir.is_dir():
        return (f"❌ Error: no parser output was found (looked in {out_dir})", *failure)

    md_preview: Optional[str] = None
    md_files = sorted(out_dir.glob("*.md"))  # sorted: deterministic pick
    if md_files:
        try:
            md_preview = md_files[0].read_text(encoding="utf-8", errors="ignore")
        except OSError:
            md_preview = None

    gallery_items, file_paths, zip_path = _gather_outputs(
        out_dir, zip_filename=original_filename, is_structured_parsing=False
    )
    return (
        f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}",
        md_preview,
        gallery_items,
        file_paths,
        zip_path,
    )
268
+
269
+
270
def run_extract(
    pdf_file: str,
    target: str,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
) -> Tuple[str, str, List[tuple[str, str]], List[str], str]:
    """Run ``ChartTablePDFParser`` on an uploaded PDF and collect its outputs.

    Args:
        pdf_file: Path of the uploaded PDF.
        target: ``"tables"``, ``"charts"`` or ``"both"``; selects what the
            parser extracts and which Excel workbook is previewed.
        use_vlm, vlm_provider, vlm_api_key, layout_model_name, dpi, min_score:
            Forwarded to ``ChartTablePDFParser`` (``dpi`` / ``min_score`` are
            coerced with ``int`` / ``float``).

    Returns:
        ``(status, tables_html, gallery_items, file_paths, zip_path)``.
        ``tables_html`` is only populated when ``use_vlm`` is true and the
        matching workbook exists. On failure ``status`` starts with an error
        marker, ``tables_html`` and ``zip_path`` are ``""`` and the lists are
        empty.
    """
    failure: Tuple[str, List[tuple[str, str]], List[str], str] = ("", [], [], "")

    if not pdf_file:
        return ("No file provided.", *failure)

    # Validate VLM configuration.
    if use_vlm:
        if not vlm_api_key:
            return ("❌ Error: VLM API key is required when using VLM", *failure)
        key = vlm_api_key.strip()
        if len(key) < 10:
            return ("❌ Error: VLM API key appears to be too short or invalid", *failure)
        if key.startswith('sk-') and len(key) < 20:
            return ("❌ Error: OpenAI API key appears to be invalid (too short)", *failure)

    original_filename = Path(pdf_file).stem
    output_base = Path("outputs")

    # The PDF is copied under its own stem; the output directory is looked up
    # under that same stem below.
    tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
    try:
        input_pdf = tmp_dir / f"{original_filename}.pdf"
        shutil.copy2(pdf_file, input_pdf)

        parser = ChartTablePDFParser(
            extract_charts=(target in ("charts", "both")),
            extract_tables=(target in ("tables", "both")),
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_api_key=vlm_api_key or None,
            layout_model_name=layout_model_name,
            dpi=int(dpi),
            min_score=float(min_score),
        )

        try:
            parser.parse(str(input_pdf), str(output_base))
        except Exception as e:
            # Report failures through the status field, as run_full_parse does.
            import traceback
            traceback.print_exc()
            try:
                error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
            except Exception:
                error_msg = "<Unicode encoding error>"
            return (f"❌ Extraction failed: {error_msg}", *failure)
    finally:
        # The temporary copy of the input is not needed once parsing is over.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    outputs_root = output_base
    out_dir = outputs_root / original_filename / "structured_parsing"
    if not out_dir.exists():
        if outputs_root.exists():
            # Fallback: most recently modified directory under outputs/.
            candidates = sorted(
                (p for p in outputs_root.glob("*") if p.is_dir()),
                key=lambda p: p.stat().st_mtime,
                reverse=True,
            )
            out_dir = candidates[0] / "structured_parsing" if candidates else outputs_root
        else:
            outputs_root.mkdir(parents=True, exist_ok=True)
            out_dir = outputs_root

    if not out_dir.is_dir():
        return (f"❌ Error: no parser output was found (looked in {out_dir})", *failure)

    # Which kinds to include in the outputs, based on the target selection.
    allowed_kinds: Optional[List[str]] = None
    if target in ("tables", "charts"):
        allowed_kinds = [target]
    elif target == "both":
        allowed_kinds = ["tables", "charts"]

    gallery_items, file_paths, zip_path = _gather_outputs(
        out_dir, allowed_kinds, zip_filename=original_filename, is_structured_parsing=True
    )

    # Build an HTML preview from the Excel workbook (only written with VLM).
    tables_html = ""
    excel_by_target = {
        "tables": "parsed_tables.xlsx",
        "charts": "parsed_charts.xlsx",
        "both": "parsed_tables_charts.xlsx",
    }
    excel_filename = excel_by_target.get(target) if use_vlm else None
    if excel_filename:
        excel_path = out_dir / excel_filename
        if excel_path.exists():
            try:
                import pandas as pd
                import html as _html

                html_blocks = []
                # Open the workbook once and close it deterministically; an
                # open handle would keep the file locked on some platforms.
                with pd.ExcelFile(excel_path) as xl_file:
                    for sheet_name in xl_file.sheet_names:
                        df = pd.read_excel(xl_file, sheet_name=sheet_name)
                        if df.empty:
                            continue
                        title = f"<h3>{_html.escape(str(sheet_name))}</h3>"
                        table_html = df.to_html(
                            classes="doc-table",
                            table_id=None,
                            escape=True,
                            index=False,
                            na_rep="",
                        )
                        html_blocks.append(title + table_html)

                tables_html = "\n".join(html_blocks)
            except Exception as e:
                # The preview is best-effort: log and fall back to no preview.
                try:
                    error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
                    print(f"Error building tables HTML: {error_msg}")
                except Exception:
                    print("Error building tables HTML: <Unicode encoding error>")
                tables_html = ""

    return (
        f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}",
        tables_html,
        gallery_items,
        file_paths,
        zip_path,
    )
390
+
391
+
392
+ THEME = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")
393
+
394
+ CUSTOM_CSS = """
395
+ .gradio-container {max-width: 100% !important; padding-left: 24px; padding-right: 24px}
396
+ .container {max-width: 100% !important}
397
+ .app {max-width: 100% !important}
398
+ .header {margin-bottom: 8px}
399
+ .subtitle {color: var(--body-text-color-subdued)}
400
+ .card {border:1px solid var(--border-color); border-radius:12px; padding:8px}
401
+ .status-ok {color: var(--color-success)}
402
+
403
+ /* Page content styling */
404
+ .page-content img {
405
+ max-width: 100% !important;
406
+ height: auto !important;
407
+ display: block !important;
408
+ margin: 10px auto !important;
409
+ border: 1px solid #ddd !important;
410
+ border-radius: 8px !important;
411
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
412
+ }
413
+
414
+ .page-content {
415
+ max-height: none !important;
416
+ overflow: visible !important;
417
+ }
418
+
419
+ /* Table styling */
420
+ .page-content table.doc-table {
421
+ width: 100% !important;
422
+ border-collapse: collapse !important;
423
+ margin: 12px 0 !important;
424
+ }
425
+ .page-content table.doc-table th,
426
+ .page-content table.doc-table td {
427
+ border: 1px solid #e5e7eb !important;
428
+ padding: 8px 10px !important;
429
+ text-align: left !important;
430
+ }
431
+ .page-content table.doc-table thead th {
432
+ background: #f9fafb !important;
433
+ font-weight: 600 !important;
434
+ }
435
+ .page-content table.doc-table tbody tr:nth-child(even) td {
436
+ background: #fafafa !important;
437
+ }
438
+
439
+ /* Clickable image buttons */
440
+ .image-button {
441
+ background: #0066cc !important;
442
+ color: white !important;
443
+ border: none !important;
444
+ padding: 5px 10px !important;
445
+ border-radius: 4px !important;
446
+ cursor: pointer !important;
447
+ margin: 2px !important;
448
+ font-size: 14px !important;
449
+ }
450
+
451
+ .image-button:hover {
452
+ background: #0052a3 !important;
453
+ }
454
+ """
455
+
456
+
457
+ def build_demo() -> gr.Blocks:
458
+ with gr.Blocks(title="Doctra - Document Parser", theme=THEME, css=CUSTOM_CSS) as demo:
459
+ gr.Markdown(
460
+ """
461
+ <div class="header">
462
+ <h2 style="margin:0">Doctra — Document Parser</h2>
463
+ <div class="subtitle">Parse PDFs, extract tables/charts, preview markdown, and download outputs.</div>
464
+ </div>
465
+ """
466
+ )
467
+
468
+
469
+ with gr.Tab("Full Parse"):
470
+ with gr.Row():
471
+ pdf = gr.File(file_types=[".pdf"], label="PDF")
472
+ use_vlm = gr.Checkbox(label="Use VLM (optional)", value=False)
473
+ vlm_provider = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
474
+ vlm_api_key = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
475
+
476
+ with gr.Accordion("Advanced", open=False):
477
+ with gr.Row():
478
+ layout_model = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
479
+ dpi = gr.Slider(100, 400, value=200, step=10, label="DPI")
480
+ min_score = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")
481
+ with gr.Row():
482
+ ocr_lang = gr.Textbox(value="eng", label="OCR Language")
483
+ ocr_psm = gr.Slider(0, 13, value=4, step=1, label="Tesseract PSM")
484
+ ocr_oem = gr.Slider(0, 3, value=3, step=1, label="Tesseract OEM")
485
+ with gr.Row():
486
+ ocr_config = gr.Textbox(value="", label="Extra OCR config")
487
+ box_sep = gr.Textbox(value="\n", label="Box separator")
488
+
489
+ run_btn = gr.Button("▶ Run Full Parse", variant="primary")
490
+ status = gr.Textbox(label="Status", elem_classes=["status-ok"])
491
+
492
+ # Page selector for extracted content
493
+ page_selector = gr.Dropdown(label="Select Page to Display", interactive=True, visible=False)
494
+
495
+ # Full Parse components
496
+ with gr.Row():
497
+ with gr.Column():
498
+ md_preview = gr.HTML(label="Extracted Content", visible=True, elem_classes=["page-content"])
499
+ with gr.Column():
500
+ page_image = gr.Image(label="Page image", interactive=False)
501
+ gallery = gr.Gallery(label="Extracted images (tables/charts/figures)", columns=4, height=420, preview=True)
502
+ files_out = gr.Files(label="Download individual output files")
503
+ zip_out = gr.File(label="Download all outputs (ZIP)")
504
+
505
+ # Hidden state to store pages data and all images
506
+ pages_state = gr.State([])
507
+ all_images_state = gr.State([])
508
+ pdf_path_state = gr.State("")
509
+ page_images_state = gr.State([]) # list of file paths per page index (1-based)
510
+
511
+ # Hidden components for image filtering
512
+ filter_trigger = gr.Button(visible=False)
513
+ current_image_path = gr.State("")
514
+ current_image_caption = gr.State("")
515
+ image_filter_input = gr.Textbox(visible=False, elem_id="image_filter_input")
516
+
517
+ def parse_markdown_by_pages(md_content: str):
518
+ """Parse markdown content and organize it by pages."""
519
+ import re
520
+
521
+ pages = []
522
+ current_page = None
523
+
524
+ lines = md_content.split('\n')
525
+ i = 0
526
+
527
+
528
+ # First, let's find all page headers
529
+ page_headers = []
530
+ for i, line in enumerate(lines):
531
+ if line.strip().startswith('## Page '):
532
+ page_num = line.strip().replace('## Page ', '').strip()
533
+ page_headers.append((i, page_num, line))
534
+
535
+
536
+ # Now parse content for each page
537
+ for i, (line_idx, page_num, header_line) in enumerate(page_headers):
538
+ # Find the end of this page (start of next page or end of document)
539
+ start_line = line_idx
540
+ if i + 1 < len(page_headers):
541
+ end_line = page_headers[i + 1][0]
542
+ else:
543
+ end_line = len(lines)
544
+
545
+ # Extract content for this page
546
+ page_content = lines[start_line:end_line]
547
+
548
+ page = {
549
+ 'page_num': page_num,
550
+ 'content': page_content
551
+ }
552
+ pages.append(page)
553
+
554
+ return pages
555
+
556
+ def update_page_selector(pages_data):
557
+ """Update the page selector dropdown with available pages."""
558
+ if not pages_data:
559
+ return gr.Dropdown(choices=[], value=None, visible=False)
560
+
561
+ page_choices = [f"Page {page['page_num']}" for page in pages_data]
562
+ return gr.Dropdown(choices=page_choices, value=page_choices[0], visible=True)
563
+
564
+ def display_selected_page(selected_page, pages_data, pdf_path, page_images):
565
+ """Display the content of the selected page and the rendered page image."""
566
+ if not selected_page or not pages_data:
567
+ return "", None
568
+
569
+
570
+ # Find the selected page
571
+ page_num = selected_page.replace("Page ", "")
572
+ page = next((p for p in pages_data if p['page_num'] == page_num), None)
573
+
574
+ if not page:
575
+ return "Page not found", None
576
+
577
+ # Build HTML with inline base64 images, render markdown tables, and preserve paragraphs/line breaks
578
+ import html as _html, base64, re as _re
579
+ base_dir = None
580
+ try:
581
+ stem = Path(pdf_path).stem if pdf_path else ""
582
+ if stem:
583
+ base_dir = Path("outputs") / stem / "full_parse"
584
+ except Exception:
585
+ base_dir = None
586
+ processed_content = []
587
+ paragraph_buffer = []
588
+ def flush_paragraph():
589
+ nonlocal paragraph_buffer
590
+ if paragraph_buffer:
591
+ joined = '<br/>'.join(_html.escape(l) for l in paragraph_buffer)
592
+ processed_content.append(f'<p>{joined}</p>')
593
+ paragraph_buffer = []
594
+
595
+ # Simple markdown table detection and rendering
596
+ def is_md_table_header(s: str) -> bool:
597
+ return '|' in s and ('---' in s or '—' in s)
598
+
599
+ def render_md_table(lines: List[str]) -> str:
600
+ rows = [l.strip().strip('|').split('|') for l in lines]
601
+ rows = [[_html.escape(c.strip()) for c in r] for r in rows]
602
+ if len(rows) < 2:
603
+ return ""
604
+ header = rows[0]
605
+ body = rows[2:] if len(rows) > 2 else []
606
+ thead = '<thead><tr>' + ''.join(f'<th>{c}</th>' for c in header) + '</tr></thead>'
607
+ tbody = '<tbody>' + ''.join('<tr>' + ''.join(f'<td>{c}</td>' for c in r) + '</tr>' for r in body) + '</tbody>'
608
+ return f'<table class="doc-table">{thead}{tbody}</table>'
609
+
610
+ i = 0
611
+ lines = page['content']
612
+ n = len(lines)
613
+ while i < n:
614
+ raw_line = lines[i]
615
+ line = raw_line.rstrip('\r\n')
616
+ stripped = line.strip()
617
+ if stripped.startswith('![') and ('](images/' in stripped or '](images\\' in stripped):
618
+ flush_paragraph()
619
+ match = _re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
620
+ if match and base_dir is not None:
621
+ caption = match.group(1)
622
+ rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
623
+ abs_path = (base_dir / rel_path).resolve()
624
+ try:
625
+ with open(abs_path, 'rb') as f:
626
+ b64 = base64.b64encode(f.read()).decode('ascii')
627
+ processed_content.append(f'<figure><img src="data:image/jpeg;base64,{b64}" alt="{_html.escape(caption)}"/><figcaption>{_html.escape(caption)}</figcaption></figure>')
628
+ except Exception:
629
+ processed_content.append(f'<div>{_html.escape(caption)} (image not found)</div>')
630
+ else:
631
+ paragraph_buffer.append(raw_line)
632
+ i += 1
633
+ continue
634
+
635
+ # Detect markdown table blocks - only if line starts with | or has multiple | characters
636
+ if (stripped.startswith('|') or stripped.count('|') >= 2) and i + 1 < n and is_md_table_header(lines[i + 1]):
637
+ flush_paragraph()
638
+ table_block = [stripped]
639
+ i += 1
640
+ table_block.append(lines[i].strip())
641
+ i += 1
642
+ while i < n:
643
+ nxt = lines[i].rstrip('\r\n')
644
+ if nxt.strip() == '' or (not nxt.strip().startswith('|') and nxt.count('|') < 2):
645
+ break
646
+ table_block.append(nxt.strip())
647
+ i += 1
648
+ html_table = render_md_table(table_block)
649
+ if html_table:
650
+ processed_content.append(html_table)
651
+ else:
652
+ for tl in table_block:
653
+ paragraph_buffer.append(tl)
654
+ continue
655
+
656
+ if stripped.startswith('## '):
657
+ flush_paragraph()
658
+ processed_content.append(f'<h3>{_html.escape(stripped[3:])}</h3>')
659
+ elif stripped.startswith('# '):
660
+ flush_paragraph()
661
+ processed_content.append(f'<h2>{_html.escape(stripped[2:])}</h2>')
662
+ elif stripped == '':
663
+ flush_paragraph()
664
+ processed_content.append('<br/>')
665
+ else:
666
+ paragraph_buffer.append(raw_line)
667
+ i += 1
668
+ flush_paragraph()
669
+
670
+ # Join the processed content lines
671
+ content = "\n".join(processed_content)
672
+
673
+ # Ensure page images are prepared
674
+ try:
675
+ if pdf_path and not page_images:
676
+ from doctra.utils.pdf_io import render_pdf_to_images
677
+ tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
678
+ pil_pages = render_pdf_to_images(pdf_path)
679
+ saved_paths: List[str] = []
680
+ for idx, (im, _, _) in enumerate(pil_pages, start=1):
681
+ out_path = tmp_img_dir / f"page_{idx:03d}.jpg"
682
+ im.save(out_path, format="JPEG", quality=90)
683
+ saved_paths.append(str(out_path))
684
+ page_images = saved_paths
685
+ page_images_state.value = saved_paths # cache
686
+ except Exception as e:
687
+ pass
688
+
689
+ # Select image for the current page number (1-based)
690
+ page_img = None
691
+ try:
692
+ page_index = int(page_num)
693
+ if page_images and 1 <= page_index <= len(page_images):
694
+ page_img = page_images[page_index - 1]
695
+ except Exception:
696
+ page_img = None
697
+
698
+ return content, page_img
699
+
700
+ def filter_gallery_by_image(img_path, caption, all_images):
701
+ """Filter gallery to show only the selected image."""
702
+ if not img_path or not all_images:
703
+ return all_images
704
+
705
+ # Find the selected image
706
+ filtered_images = []
707
+ for stored_img_path, stored_caption in all_images:
708
+ if stored_caption == caption:
709
+ filtered_images.append((stored_img_path, stored_caption))
710
+ break
711
+
712
+ return filtered_images
713
+
714
+ def trigger_image_filter(filter_input):
715
+ """Trigger image filtering when input changes."""
716
+ if not filter_input:
717
+ return "", ""
718
+
719
+ # Parse the input (format: "img_path|caption")
720
+ parts = filter_input.split("|", 1)
721
+ if len(parts) == 2:
722
+ img_path, caption = parts
723
+ return img_path, caption
724
+ return "", ""
725
+
726
+ def filter_gallery_by_trigger(img_path, caption, all_images):
727
+ """Filter gallery based on trigger values."""
728
+ if not img_path or not caption or not all_images:
729
+ return all_images
730
+
731
+ # Find the selected image
732
+ filtered_images = []
733
+ for stored_img_path, stored_caption in all_images:
734
+ if stored_caption == caption:
735
+ filtered_images.append((stored_img_path, stored_caption))
736
+ break
737
+
738
+ return filtered_images
739
+
740
+ def run_full_parse_with_pages(*args):
741
+ """Run full parse and parse the markdown into pages."""
742
+ result = run_full_parse(*args)
743
+ status_msg, md_content, gallery_items, file_paths, zip_path = result
744
+
745
+ # Parse markdown into pages
746
+ pages_data = []
747
+ first_page_content = ""
748
+ all_images = []
749
+ if md_content:
750
+ pages_data = parse_markdown_by_pages(md_content)
751
+
752
+ # Collect all images from all pages
753
+ for page in pages_data:
754
+ for line in page['content']:
755
+ if line.strip().startswith('![') and ('](images/' in line or '](images\\' in line):
756
+ import re
757
+ match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line.strip())
758
+ if match:
759
+ caption = match.group(1)
760
+ img_path = match.group(2)
761
+ all_images.append((img_path, caption))
762
+
763
+
764
+ # Show only Page 1 content initially
765
+ if pages_data:
766
+ first_page = pages_data[0]
767
+ first_page_content = "\n".join(first_page['content'])
768
+
769
+ # Prepare first page image immediately and cache page images
770
+ input_pdf_path = args[0]
771
+ first_page_image = None
772
+ saved_paths: List[str] = []
773
+ try:
774
+ if input_pdf_path:
775
+ from doctra.utils.pdf_io import render_pdf_to_images
776
+ tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
777
+ pil_pages = render_pdf_to_images(input_pdf_path)
778
+ for idx, (im, _, _) in enumerate(pil_pages, start=1):
779
+ out_path = tmp_img_dir / f"page_{idx:03d}.jpg"
780
+ im.save(out_path, format="JPEG", quality=90)
781
+ saved_paths.append(str(out_path))
782
+ if saved_paths:
783
+ first_page_image = saved_paths[0]
784
+ except Exception as e:
785
+ pass
786
+
787
+ # Build initial HTML with inline images and proper blocks for first page
788
+ if pages_data:
789
+ import html as _html, base64, re as _re
790
+ base_dir = None
791
+ try:
792
+ stem = Path(input_pdf_path).stem if input_pdf_path else ""
793
+ if stem:
794
+ base_dir = Path("outputs") / stem / "full_parse"
795
+ except Exception:
796
+ base_dir = None
797
+ html_lines: List[str] = []
798
+ for raw_line in pages_data[0]['content']:
799
+ line = raw_line.strip()
800
+ if line.startswith('![') and ('](images/' in line or '](images\\' in line):
801
+ match = _re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
802
+ if match and base_dir is not None:
803
+ caption = match.group(1)
804
+ rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
805
+ abs_path = (base_dir / rel_path).resolve()
806
+ try:
807
+ with open(abs_path, 'rb') as f:
808
+ b64 = base64.b64encode(f.read()).decode('ascii')
809
+ html_lines.append(f'<figure><img src="data:image/jpeg;base64,{b64}" alt="{_html.escape(caption)}"/><figcaption>{_html.escape(caption)}</figcaption></figure>')
810
+ except Exception:
811
+ html_lines.append(f'<div>{_html.escape(caption)} (image not found)</div>')
812
+ else:
813
+ html_lines.append(f'<p>{_html.escape(raw_line)}</p>')
814
+ else:
815
+ if line.startswith('## '):
816
+ html_lines.append(f'<h3>{_html.escape(line[3:])}</h3>')
817
+ elif line.startswith('# '):
818
+ html_lines.append(f'<h2>{_html.escape(line[2:])}</h2>')
819
+ elif line == '':
820
+ html_lines.append('<br/>')
821
+ else:
822
+ html_lines.append(f'<p>{_html.escape(raw_line)}</p>')
823
+ first_page_content = "\n".join(html_lines)
824
+
825
+ return status_msg, first_page_content, first_page_image, gallery_items, file_paths, zip_path, pages_data, all_images, input_pdf_path, saved_paths
826
+
827
+ run_btn.click(
828
+ fn=run_full_parse_with_pages,
829
+ inputs=[pdf, use_vlm, vlm_provider, vlm_api_key, layout_model, dpi, min_score, ocr_lang, ocr_psm, ocr_oem, ocr_config, box_sep],
830
+ outputs=[status, md_preview, page_image, gallery, files_out, zip_out, pages_state, all_images_state, pdf_path_state, page_images_state],
831
+ ).then(
832
+ fn=update_page_selector,
833
+ inputs=[pages_state],
834
+ outputs=[page_selector],
835
+ )
836
+
837
+ page_selector.change(
838
+ fn=display_selected_page,
839
+ inputs=[page_selector, pages_state, pdf_path_state, page_images_state],
840
+ outputs=[md_preview, page_image],
841
+ )
842
+
843
+ image_filter_input.change(
844
+ fn=trigger_image_filter,
845
+ inputs=[image_filter_input],
846
+ outputs=[current_image_path, current_image_caption],
847
+ ).then(
848
+ fn=filter_gallery_by_trigger,
849
+ inputs=[current_image_path, current_image_caption, all_images_state],
850
+ outputs=[gallery],
851
+ )
852
+
853
+ with gr.Tab("Extract Tables/Charts"):
854
+ with gr.Row():
855
+ pdf_e = gr.File(file_types=[".pdf"], label="PDF")
856
+ target = gr.Dropdown(["tables", "charts", "both"], value="both", label="Target")
857
+ use_vlm_e = gr.Checkbox(label="Use VLM (optional)", value=False)
858
+ vlm_provider_e = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
859
+ vlm_api_key_e = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
860
+ with gr.Accordion("Advanced", open=False):
861
+ with gr.Row():
862
+ layout_model_e = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
863
+ dpi_e = gr.Slider(100, 400, value=200, step=10, label="DPI")
864
+ min_score_e = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")
865
+
866
+ run_btn_e = gr.Button("▶ Run Extraction", variant="primary")
867
+ status_e = gr.Textbox(label="Status")
868
+ # Dropdown to select specific item
869
+ item_selector_e = gr.Dropdown(label="Select Item", visible=False, interactive=True)
870
+
871
+ # Display extracted data and images
872
+ with gr.Row():
873
+ tables_preview_e = gr.HTML(label="Extracted Data", elem_classes=["page-content"])
874
+ image_e = gr.Image(label="Selected Image", interactive=False)
875
+
876
+ # Keep gallery for reference but make it smaller
877
+ gallery_e = gr.Gallery(label="All Extracted Images", columns=4, height=200, preview=True)
878
+ files_out_e = gr.Files(label="Download individual output files")
879
+ zip_out_e = gr.File(label="Download all outputs (ZIP)")
880
+
881
+ # State to store output directory
882
+ out_dir_state = gr.State("")
883
+
884
def capture_out_dir(status_text):
    """Extract the output directory path embedded in a status message.

    Returns the text that follows the first ``"Output directory:"`` marker
    in ``status_text``, stripped of surrounding whitespace. Returns ``""``
    when the status is empty, carries no marker, or is not something the
    marker can be searched in.
    """
    marker = "Output directory:"
    if status_text:
        try:
            _before, found, remainder = status_text.partition(marker)
            if found:
                return remainder.strip()
        except Exception:
            # Best effort: anything that is not a usable string yields "".
            pass
    return ""
893
+
894
def build_item_selector(out_dir_path, target, use_vlm):
    """Build the "Select Item" dropdown from a run's ``vlm_items.json``.

    Reads ``<out_dir_path>/vlm_items.json`` and offers one choice per entry
    whose ``kind`` matches ``target``. Each choice is a
    ``(label, image_rel_path)`` pair labelled ``"<title> — Page <page>"``.

    Args:
        out_dir_path: Output directory of the extraction run.
        target: ``"tables"``, ``"charts"`` or ``"both"``; any other value
            matches no entry.
        use_vlm: Whether VLM was enabled; the selector stays hidden if not.

    Returns:
        A ``gr.Dropdown`` with the matching choices and the first one
        preselected, or an empty hidden dropdown when there is nothing to
        show or the mapping cannot be read.
    """
    if not out_dir_path or not use_vlm:
        return gr.Dropdown(choices=[], value=None, visible=False)

    import json

    kind_for_target = {"tables": "table", "charts": "chart"}

    try:
        mapping = Path(out_dir_path) / "vlm_items.json"
        if not mapping.exists():
            return gr.Dropdown(choices=[], value=None, visible=False)

        data = json.loads(mapping.read_text(encoding="utf-8"))
        choices = []

        for entry in data:
            # One malformed entry must not hide every other item.
            if not isinstance(entry, dict):
                continue

            kind = entry.get("kind")
            if target != "both" and (
                target not in kind_for_target or kind != kind_for_target[target]
            ):
                continue

            # `kind` may be missing or null; fall back to a generic label
            # instead of raising on `None.title()`.
            title = entry.get("title") or str(kind or "Item").title()
            page = entry.get("page")
            rel_path = entry.get("image_rel_path")
            choices.append((f"{title} — Page {page}", rel_path))

        return gr.Dropdown(
            choices=choices,
            value=choices[0][1] if choices else None,
            visible=bool(choices),
        )
    except Exception:
        # UI callback boundary: an unreadable or invalid mapping hides the
        # selector rather than failing the event chain.
        return gr.Dropdown(choices=[], value=None, visible=False)
921
+
922
def show_selected_item(rel_path, out_dir_path):
    """Render the extracted data and image path for one selected item.

    Looks up the entry in ``<out_dir_path>/vlm_items.json`` whose
    ``image_rel_path`` equals ``rel_path`` and renders its ``headers`` and
    ``rows`` as an HTML table under a ``"<title> (<Kind>)"`` heading.

    Args:
        rel_path: ``image_rel_path`` of the selected item (the dropdown value).
        out_dir_path: Output directory containing ``vlm_items.json``.

    Returns:
        ``(html, image_path)``. ``html`` is the rendered table, or a
        "No structured data available" note when the entry lacks headers or
        rows. ``image_path`` is ``out_dir_path / rel_path`` resolved to an
        absolute string. ``("", None)`` is returned when an argument is
        empty, the mapping is missing or unreadable, or no entry matches.
    """
    if not rel_path or not out_dir_path:
        return "", None

    import html as _html
    import json

    try:
        out_dir = Path(out_dir_path)
        mapping = out_dir / "vlm_items.json"
        if not mapping.exists():
            return "", None

        data = json.loads(mapping.read_text(encoding="utf-8"))
        entry = next(
            (
                item
                for item in data
                if isinstance(item, dict) and item.get("image_rel_path") == rel_path
            ),
            None,
        )
        if entry is None:
            return "", None

        headers = entry.get("headers") or []
        rows = entry.get("rows") or []
        # Normalise to str: JSON may carry null or non-string values, which
        # would otherwise raise below and silently blank the preview.
        title = str(entry.get("title") or "Data")
        kind = str(entry.get("kind") or "table")
        heading = f'<h3>{_html.escape(title)} ({_html.escape(kind.title())})</h3>'

        if headers and rows:
            head_cells = ''.join(f'<th>{_html.escape(str(h))}</th>' for h in headers)
            body_rows = ''.join(
                '<tr>' + ''.join(f'<td>{_html.escape(str(c))}</td>' for c in row) + '</tr>'
                for row in rows
            )
            html_table = (
                f'{heading}<table class="doc-table">'
                f'<thead><tr>{head_cells}</tr></thead>'
                f'<tbody>{body_rows}</tbody></table>'
            )
        else:
            html_table = f'{heading}<p>No structured data available</p>'

        img_abs = str((out_dir / rel_path).resolve())
        return html_table, img_abs
    except Exception:
        # UI callback boundary: an unreadable or invalid mapping clears the
        # preview rather than failing the event chain.
        return "", None
957
+
958
+ run_btn_e.click(
959
+ fn=lambda f, t, a, b, c, d, e, g: run_extract(
960
+ f.name if f else "",
961
+ t,
962
+ a,
963
+ b,
964
+ c,
965
+ d,
966
+ e,
967
+ g,
968
+ ),
969
+ inputs=[pdf_e, target, use_vlm_e, vlm_provider_e, vlm_api_key_e, layout_model_e, dpi_e, min_score_e],
970
+ outputs=[status_e, tables_preview_e, gallery_e, files_out_e, zip_out_e],
971
+ ).then(
972
+ fn=capture_out_dir,
973
+ inputs=[status_e],
974
+ outputs=[out_dir_state]
975
+ ).then(
976
+ fn=build_item_selector,
977
+ inputs=[out_dir_state, target, use_vlm_e],
978
+ outputs=[item_selector_e]
979
+ ).then(
980
+ fn=show_selected_item,
981
+ inputs=[item_selector_e, out_dir_state],
982
+ outputs=[tables_preview_e, image_e]
983
+ )
984
+
985
+ # Handle dropdown selection changes
986
+ item_selector_e.change(
987
+ fn=show_selected_item,
988
+ inputs=[item_selector_e, out_dir_state],
989
+ outputs=[tables_preview_e, image_e]
990
+ )
991
+
992
+
993
+ gr.Markdown(
994
+ """
995
+ <div class="card">
996
+ <b>Tips</b>
997
+ <ul>
998
+ <li>On Spaces, set a secret <code>VLM_API_KEY</code> to enable VLM features.</li>
999
+ <li>Outputs are saved under <code>outputs/&lt;pdf_stem&gt;/</code>.</li>
1000
+ </ul>
1001
+ </div>
1002
+ """
1003
+ )
1004
+
1005
+ return demo
1006
+
1007
+
1008
def launch_ui(server_name: str = "0.0.0.0", server_port: int = 7860, share: bool = False):
    """Build the demo with ``build_demo()`` and launch it.

    Args:
        server_name: Forwarded to ``demo.launch`` as ``server_name``.
        server_port: Forwarded to ``demo.launch`` as ``server_port``.
        share: Forwarded to ``demo.launch`` as ``share``.
    """
    launch_options = {
        "server_name": server_name,
        "server_port": server_port,
        "share": share,
    }
    build_demo().launch(**launch_options)
1011
+
1012
+