doctra 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
doctra/ui/app.py CHANGED
@@ -1,436 +1,38 @@
1
- import os
2
- import shutil
3
- import tempfile
4
- import re
5
- import traceback
6
- import pandas as pd
7
- import html as _html
8
- import base64
9
- import json
10
- from pathlib import Path
11
- from typing import Optional, Tuple, List, Dict, Any
12
-
13
- import gradio as gr
14
-
15
- from doctra.parsers.structured_pdf_parser import StructuredPDFParser
16
- from doctra.parsers.table_chart_extractor import ChartTablePDFParser
17
- from doctra.utils.pdf_io import render_pdf_to_images
18
-
19
-
20
- def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zip_filename: Optional[str] = None, is_structured_parsing: bool = False) -> Tuple[List[tuple[str, str]], List[str], str]:
21
- gallery_items: List[tuple[str, str]] = []
22
- file_paths: List[str] = []
1
+ """
2
+ Main Doctra Gradio Application
23
3
 
24
- if out_dir.exists():
25
- if is_structured_parsing:
26
- for file_path in sorted(out_dir.rglob("*")):
27
- if file_path.is_file():
28
- file_paths.append(str(file_path))
29
- else:
30
- main_files = [
31
- "result.html",
32
- "result.md",
33
- "tables.html",
34
- "tables.xlsx"
35
- ]
36
-
37
- for main_file in main_files:
38
- file_path = out_dir / main_file
39
- if file_path.exists():
40
- file_paths.append(str(file_path))
41
-
42
- if allowed_kinds:
43
- for kind in allowed_kinds:
44
- p = out_dir / kind
45
- if p.exists():
46
- for img in sorted(p.glob("*.png")):
47
- file_paths.append(str(img))
48
-
49
- images_dir = out_dir / "images" / kind
50
- if images_dir.exists():
51
- for img in sorted(images_dir.glob("*.jpg")):
52
- file_paths.append(str(img))
53
- else:
54
- for p in (out_dir / "charts").glob("*.png"):
55
- file_paths.append(str(p))
56
- for p in (out_dir / "tables").glob("*.png"):
57
- file_paths.append(str(p))
58
- for p in (out_dir / "images").rglob("*.jpg"):
59
- file_paths.append(str(p))
4
+ This module serves as the main entry point for the Doctra Gradio interface.
5
+ It imports and composes the modular UI components for a clean, maintainable structure.
60
6
 
61
- if allowed_kinds:
62
- if "charts" in allowed_kinds and "tables" in allowed_kinds:
63
- excel_files = ["parsed_tables_charts.xlsx"]
64
- elif "charts" in allowed_kinds:
65
- excel_files = ["parsed_charts.xlsx"]
66
- elif "tables" in allowed_kinds:
67
- excel_files = ["parsed_tables.xlsx"]
68
- else:
69
- excel_files = []
70
-
71
- for excel_file in excel_files:
72
- excel_path = out_dir / excel_file
73
- if excel_path.exists():
74
- file_paths.append(str(excel_path))
7
+ The application is organized into the following modules:
8
+ - ui_helpers.py: Shared utilities, constants, and helper functions
9
+ - full_parse_ui.py: Full PDF parsing functionality with page navigation
10
+ - tables_charts_ui.py: Table and chart extraction with VLM integration
11
+ - docres_ui.py: Image restoration functionality with before/after comparison
12
+ - enhanced_parser_ui.py: Enhanced PDF parsing with DocRes image restoration
75
13
 
76
- kinds = allowed_kinds if allowed_kinds else ["tables", "charts", "figures"]
77
- for sub in kinds:
78
- p = out_dir / sub
79
- if p.exists():
80
- for img in sorted(p.glob("*.png")):
81
- gallery_items.append((str(img), f"{sub}: {img.name}"))
82
-
83
- images_dir = out_dir / "images" / sub
84
- if images_dir.exists():
85
- for img in sorted(images_dir.glob("*.jpg")):
86
- gallery_items.append((str(img), f"{sub}: {img.name}"))
14
+ Each module is self-contained with its own state management and event handlers,
15
+ making the codebase easier to navigate, test, and extend.
16
+ """
87
17
 
88
- tmp_zip_dir = Path(tempfile.mkdtemp(prefix="doctra_zip_"))
89
-
90
- if zip_filename:
91
- safe_filename = re.sub(r'[<>:"/\\|?*]', '_', zip_filename)
92
- zip_base = tmp_zip_dir / safe_filename
93
- else:
94
- zip_base = tmp_zip_dir / "doctra_outputs"
95
-
96
- filtered_dir = tmp_zip_dir / "filtered_outputs"
97
- shutil.copytree(out_dir, filtered_dir, ignore=shutil.ignore_patterns('~$*', '*.tmp', '*.temp'))
98
-
99
- zip_path = shutil.make_archive(str(zip_base), 'zip', root_dir=str(filtered_dir))
18
+ import gradio as gr
100
19
 
101
- return gallery_items, file_paths, zip_path
20
+ from doctra.ui.ui_helpers import THEME, CUSTOM_CSS, create_tips_markdown
21
+ from doctra.ui.full_parse_ui import create_full_parse_tab
22
+ from doctra.ui.tables_charts_ui import create_tables_charts_tab
23
+ from doctra.ui.docres_ui import create_docres_tab
24
+ from doctra.ui.enhanced_parser_ui import create_enhanced_parser_tab
102
25
 
103
26
 
104
- def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
105
- """
106
- Parse markdown content and organize it by pages.
107
- Returns a list of page dictionaries with content, tables, charts, and figures.
27
+ def build_demo() -> gr.Blocks:
108
28
  """
29
+ Build the main Doctra Gradio interface using modular components.
109
30
 
110
- pages = []
111
- current_page = None
112
-
113
- lines = md_content.split('\n')
114
- i = 0
115
-
116
- while i < len(lines):
117
- line = lines[i].strip()
118
-
119
- if line.startswith('## Page '):
120
- if current_page:
121
- pages.append(current_page)
122
-
123
- page_num = line.replace('## Page ', '').strip()
124
- current_page = {
125
- 'page_num': page_num,
126
- 'content': [],
127
- 'tables': [],
128
- 'charts': [],
129
- 'figures': [],
130
- 'images': [],
131
- 'full_content': [] # Store full content with inline images
132
- }
133
- i += 1
134
- continue
135
-
136
- if line.startswith('![') and '](images/' in line:
137
- match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
138
- if match:
139
- caption = match.group(1)
140
- img_path = match.group(2)
141
-
142
- if 'Table' in caption:
143
- current_page['tables'].append({'caption': caption, 'path': img_path})
144
- elif 'Chart' in caption:
145
- current_page['charts'].append({'caption': caption, 'path': img_path})
146
- elif 'Figure' in caption:
147
- current_page['figures'].append({'caption': caption, 'path': img_path})
148
-
149
- current_page['images'].append({'caption': caption, 'path': img_path})
150
-
151
- current_page['full_content'].append(f"![{caption}]({img_path})")
152
-
153
- elif current_page:
154
- if line:
155
- current_page['content'].append(line)
156
- current_page['full_content'].append(line)
157
-
158
- i += 1
159
-
160
- if current_page:
161
- pages.append(current_page)
162
-
163
- return pages
164
-
165
-
166
- def run_full_parse(
167
- pdf_file: str,
168
- use_vlm: bool,
169
- vlm_provider: str,
170
- vlm_api_key: str,
171
- layout_model_name: str,
172
- dpi: int,
173
- min_score: float,
174
- ocr_lang: str,
175
- ocr_psm: int,
176
- ocr_oem: int,
177
- ocr_extra_config: str,
178
- box_separator: str,
179
- ) -> Tuple[str, Optional[str], List[tuple[str, str]], List[str], str]:
180
- if not pdf_file:
181
- return ("No file provided.", None, [], [], "")
182
-
183
- original_filename = Path(pdf_file).stem
184
-
185
- tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
186
- input_pdf = tmp_dir / f"{original_filename}.pdf"
187
- shutil.copy2(pdf_file, input_pdf)
188
-
189
- # Validate VLM configuration
190
- if use_vlm and not vlm_api_key:
191
- return ("❌ Error: VLM API key is required when using VLM", None, [], [], "")
192
-
193
- if use_vlm and vlm_api_key:
194
- # Basic API key validation
195
- if len(vlm_api_key.strip()) < 10:
196
- return ("❌ Error: VLM API key appears to be too short or invalid", None, [], [], "")
197
- if vlm_api_key.strip().startswith('sk-') and len(vlm_api_key.strip()) < 20:
198
- return ("❌ Error: OpenAI API key appears to be invalid (too short)", None, [], [], "")
199
-
200
- parser = StructuredPDFParser(
201
- use_vlm=use_vlm,
202
- vlm_provider=vlm_provider,
203
- vlm_api_key=vlm_api_key or None,
204
- layout_model_name=layout_model_name,
205
- dpi=int(dpi),
206
- min_score=float(min_score),
207
- ocr_lang=ocr_lang,
208
- ocr_psm=int(ocr_psm),
209
- ocr_oem=int(ocr_oem),
210
- ocr_extra_config=ocr_extra_config or "",
211
- box_separator=box_separator or "\n",
212
- )
213
-
214
- try:
215
- parser.parse(str(input_pdf))
216
- except Exception as e:
217
- traceback.print_exc()
218
- # Safely encode error message for return value
219
- try:
220
- error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
221
- return (f"❌ VLM processing failed: {error_msg}", None, [], [], "")
222
- except Exception:
223
- return (f"❌ VLM processing failed: <Unicode encoding error>", None, [], [], "")
224
-
225
- outputs_root = Path("outputs")
226
- out_dir = outputs_root / original_filename / "full_parse"
227
- if not out_dir.exists():
228
- # fallback: search latest created dir under outputs
229
- candidates = sorted(outputs_root.glob("*/"), key=lambda p: p.stat().st_mtime, reverse=True)
230
- if candidates:
231
- out_dir = candidates[0] / "full_parse"
232
- else:
233
- out_dir = outputs_root
234
-
235
- md_file = next(out_dir.glob("*.md"), None)
236
- md_preview = None
237
- if md_file and md_file.exists():
238
- try:
239
- with md_file.open("r", encoding="utf-8", errors="ignore") as f:
240
- md_preview = f.read() # Return the full markdown content
241
- except Exception:
242
- md_preview = None
243
-
244
- gallery_items, file_paths, zip_path = _gather_outputs(out_dir, zip_filename=original_filename, is_structured_parsing=False)
245
- return (f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}", md_preview, gallery_items, file_paths, zip_path)
246
-
247
-
248
- def run_extract(
249
- pdf_file: str,
250
- target: str,
251
- use_vlm: bool,
252
- vlm_provider: str,
253
- vlm_api_key: str,
254
- layout_model_name: str,
255
- dpi: int,
256
- min_score: float,
257
- ) -> Tuple[str, str, List[tuple[str, str]], List[str], str]:
258
- if not pdf_file:
259
- return ("No file provided.", "", [], [], "")
260
-
261
- # Validate VLM configuration
262
- if use_vlm and not vlm_api_key:
263
- return ("❌ Error: VLM API key is required when using VLM", "", [], [], "")
264
-
265
- if use_vlm and vlm_api_key:
266
- # Basic API key validation
267
- if len(vlm_api_key.strip()) < 10:
268
- return ("❌ Error: VLM API key appears to be too short or invalid", "", [], [], "")
269
- if vlm_api_key.strip().startswith('sk-') and len(vlm_api_key.strip()) < 20:
270
- return ("❌ Error: OpenAI API key appears to be invalid (too short)", "", [], [], "")
271
-
272
- # Extract filename from the uploaded file path
273
- original_filename = Path(pdf_file).stem
274
-
275
- tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
276
- input_pdf = tmp_dir / f"{original_filename}.pdf"
277
- shutil.copy2(pdf_file, input_pdf)
278
-
279
- parser = ChartTablePDFParser(
280
- extract_charts=(target in ("charts", "both")),
281
- extract_tables=(target in ("tables", "both")),
282
- use_vlm=use_vlm,
283
- vlm_provider=vlm_provider,
284
- vlm_api_key=vlm_api_key or None,
285
- layout_model_name=layout_model_name,
286
- dpi=int(dpi),
287
- min_score=float(min_score),
288
- )
289
-
290
- output_base = Path("outputs")
291
- parser.parse(str(input_pdf), str(output_base))
292
-
293
- outputs_root = output_base
294
- out_dir = outputs_root / original_filename / "structured_parsing"
295
- if not out_dir.exists():
296
- if outputs_root.exists():
297
- candidates = sorted(outputs_root.glob("*/"), key=lambda p: p.stat().st_mtime, reverse=True)
298
- if candidates:
299
- out_dir = candidates[0] / "structured_parsing"
300
- else:
301
- out_dir = outputs_root
302
- else:
303
- outputs_root.mkdir(parents=True, exist_ok=True)
304
- out_dir = outputs_root
305
-
306
- # Determine which kinds to include in outputs based on target selection
307
- allowed_kinds: Optional[List[str]] = None
308
- if target in ("tables", "charts"):
309
- allowed_kinds = [target]
310
- elif target == "both":
311
- allowed_kinds = ["tables", "charts"]
312
-
313
- gallery_items, file_paths, zip_path = _gather_outputs(out_dir, allowed_kinds, zip_filename=original_filename, is_structured_parsing=True)
314
-
315
- # Build tables HTML preview from Excel data (when VLM enabled)
316
- tables_html = ""
317
- try:
318
- if use_vlm:
319
- # Find Excel file based on target
320
- excel_filename = None
321
- if target in ("tables", "charts"):
322
- if target == "tables":
323
- excel_filename = "parsed_tables.xlsx"
324
- else: # charts
325
- excel_filename = "parsed_charts.xlsx"
326
- elif target == "both":
327
- excel_filename = "parsed_tables_charts.xlsx"
328
-
329
- if excel_filename:
330
- excel_path = out_dir / excel_filename
331
- if excel_path.exists():
332
-
333
- # Read Excel file and create HTML tables
334
- xl_file = pd.ExcelFile(excel_path)
335
- html_blocks = []
336
-
337
- for sheet_name in xl_file.sheet_names:
338
- df = pd.read_excel(excel_path, sheet_name=sheet_name)
339
- if not df.empty:
340
- # Create table with title
341
- title = f"<h3>{_html.escape(sheet_name)}</h3>"
342
-
343
- # Convert DataFrame to HTML table
344
- table_html = df.to_html(
345
- classes="doc-table",
346
- table_id=None,
347
- escape=True,
348
- index=False,
349
- na_rep=""
350
- )
351
-
352
- html_blocks.append(title + table_html)
353
-
354
- tables_html = "\n".join(html_blocks)
355
- except Exception as e:
356
- # Safely encode error message to handle Unicode characters
357
- try:
358
- error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
359
- print(f"Error building tables HTML: {error_msg}")
360
- except Exception:
361
- print(f"Error building tables HTML: <Unicode encoding error>")
362
- tables_html = ""
363
-
364
- return (f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}", tables_html, gallery_items, file_paths, zip_path)
365
-
366
-
367
- THEME = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")
368
-
369
- CUSTOM_CSS = """
370
- .gradio-container {max-width: 100% !important; padding-left: 24px; padding-right: 24px}
371
- .container {max-width: 100% !important}
372
- .app {max-width: 100% !important}
373
- .header {margin-bottom: 8px}
374
- .subtitle {color: var(--body-text-color-subdued)}
375
- .card {border:1px solid var(--border-color); border-radius:12px; padding:8px}
376
- .status-ok {color: var(--color-success)}
377
-
378
- /* Page content styling */
379
- .page-content img {
380
- max-width: 100% !important;
381
- height: auto !important;
382
- display: block !important;
383
- margin: 10px auto !important;
384
- border: 1px solid #ddd !important;
385
- border-radius: 8px !important;
386
- box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
387
- }
388
-
389
- .page-content {
390
- max-height: none !important;
391
- overflow: visible !important;
392
- }
393
-
394
- /* Table styling */
395
- .page-content table.doc-table {
396
- width: 100% !important;
397
- border-collapse: collapse !important;
398
- margin: 12px 0 !important;
399
- }
400
- .page-content table.doc-table th,
401
- .page-content table.doc-table td {
402
- border: 1px solid #e5e7eb !important;
403
- padding: 8px 10px !important;
404
- text-align: left !important;
405
- }
406
- .page-content table.doc-table thead th {
407
- background: #f9fafb !important;
408
- font-weight: 600 !important;
409
- }
410
- .page-content table.doc-table tbody tr:nth-child(even) td {
411
- background: #fafafa !important;
412
- }
413
-
414
- /* Clickable image buttons */
415
- .image-button {
416
- background: #0066cc !important;
417
- color: white !important;
418
- border: none !important;
419
- padding: 5px 10px !important;
420
- border-radius: 4px !important;
421
- cursor: pointer !important;
422
- margin: 2px !important;
423
- font-size: 14px !important;
424
- }
425
-
426
- .image-button:hover {
427
- background: #0052a3 !important;
428
- }
429
- """
430
-
431
-
432
- def build_demo() -> gr.Blocks:
31
+ Returns:
32
+ Configured Gradio Blocks interface
33
+ """
433
34
  with gr.Blocks(title="Doctra - Document Parser", theme=THEME, css=CUSTOM_CSS) as demo:
35
+ # Header section
434
36
  gr.Markdown(
435
37
  """
436
38
  <div class="header">
@@ -440,540 +42,23 @@ def build_demo() -> gr.Blocks:
440
42
  """
441
43
  )
442
44
 
45
+ # Create modular tabs
46
+ full_parse_tab, full_parse_state = create_full_parse_tab()
47
+ tables_charts_tab, tables_charts_state = create_tables_charts_tab()
48
+ docres_tab, docres_state = create_docres_tab()
49
+ enhanced_parser_tab, enhanced_parser_state = create_enhanced_parser_tab()
443
50
 
444
- with gr.Tab("Full Parse"):
445
- with gr.Row():
446
- pdf = gr.File(file_types=[".pdf"], label="PDF")
447
- use_vlm = gr.Checkbox(label="Use VLM (optional)", value=False)
448
- vlm_provider = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
449
- vlm_api_key = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
450
-
451
- with gr.Accordion("Advanced", open=False):
452
- with gr.Row():
453
- layout_model = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
454
- dpi = gr.Slider(100, 400, value=200, step=10, label="DPI")
455
- min_score = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")
456
- with gr.Row():
457
- ocr_lang = gr.Textbox(value="eng", label="OCR Language")
458
- ocr_psm = gr.Slider(0, 13, value=4, step=1, label="Tesseract PSM")
459
- ocr_oem = gr.Slider(0, 3, value=3, step=1, label="Tesseract OEM")
460
- with gr.Row():
461
- ocr_config = gr.Textbox(value="", label="Extra OCR config")
462
- box_sep = gr.Textbox(value="\n", label="Box separator")
463
-
464
- run_btn = gr.Button("▶ Run Full Parse", variant="primary")
465
- status = gr.Textbox(label="Status", elem_classes=["status-ok"])
466
-
467
- # Page selector for extracted content
468
- page_selector = gr.Dropdown(label="Select Page to Display", interactive=True, visible=False)
469
-
470
- # Full Parse components
471
- with gr.Row():
472
- with gr.Column():
473
- md_preview = gr.HTML(label="Extracted Content", visible=True, elem_classes=["page-content"])
474
- with gr.Column():
475
- page_image = gr.Image(label="Page image", interactive=False)
476
- gallery = gr.Gallery(label="Extracted images (tables/charts/figures)", columns=4, height=420, preview=True)
477
- files_out = gr.Files(label="Download individual output files")
478
- zip_out = gr.File(label="Download all outputs (ZIP)")
479
-
480
- # Hidden state to store pages data and all images
481
- pages_state = gr.State([])
482
- all_images_state = gr.State([])
483
- pdf_path_state = gr.State("")
484
- page_images_state = gr.State([]) # list of file paths per page index (1-based)
485
-
486
- # Hidden components for image filtering
487
- filter_trigger = gr.Button(visible=False)
488
- current_image_path = gr.State("")
489
- current_image_caption = gr.State("")
490
- image_filter_input = gr.Textbox(visible=False, elem_id="image_filter_input")
491
-
492
- def parse_markdown_by_pages(md_content: str):
493
- """Parse markdown content and organize it by pages."""
494
-
495
- pages = []
496
- current_page = None
497
-
498
- lines = md_content.split('\n')
499
- i = 0
500
-
501
-
502
- # First, let's find all page headers
503
- page_headers = []
504
- for i, line in enumerate(lines):
505
- if line.strip().startswith('## Page '):
506
- page_num = line.strip().replace('## Page ', '').strip()
507
- page_headers.append((i, page_num, line))
508
-
509
-
510
- # Now parse content for each page
511
- for i, (line_idx, page_num, header_line) in enumerate(page_headers):
512
- # Find the end of this page (start of next page or end of document)
513
- start_line = line_idx
514
- if i + 1 < len(page_headers):
515
- end_line = page_headers[i + 1][0]
516
- else:
517
- end_line = len(lines)
518
-
519
- # Extract content for this page
520
- page_content = lines[start_line:end_line]
521
-
522
- page = {
523
- 'page_num': page_num,
524
- 'content': page_content
525
- }
526
- pages.append(page)
527
-
528
- return pages
529
-
530
- def update_page_selector(pages_data):
531
- """Update the page selector dropdown with available pages."""
532
- if not pages_data:
533
- return gr.Dropdown(choices=[], value=None, visible=False)
534
-
535
- page_choices = [f"Page {page['page_num']}" for page in pages_data]
536
- return gr.Dropdown(choices=page_choices, value=page_choices[0], visible=True)
537
-
538
- def display_selected_page(selected_page, pages_data, pdf_path, page_images):
539
- """Display the content of the selected page and the rendered page image."""
540
- if not selected_page or not pages_data:
541
- return "", None
542
-
543
-
544
- # Find the selected page
545
- page_num = selected_page.replace("Page ", "")
546
- page = next((p for p in pages_data if p['page_num'] == page_num), None)
547
-
548
- if not page:
549
- return "Page not found", None
550
-
551
- # Build HTML with inline base64 images, render markdown tables, and preserve paragraphs/line breaks
552
- base_dir = None
553
- try:
554
- stem = Path(pdf_path).stem if pdf_path else ""
555
- if stem:
556
- base_dir = Path("outputs") / stem / "full_parse"
557
- except Exception:
558
- base_dir = None
559
- processed_content = []
560
- paragraph_buffer = []
561
- def flush_paragraph():
562
- nonlocal paragraph_buffer
563
- if paragraph_buffer:
564
- joined = '<br/>'.join(_html.escape(l) for l in paragraph_buffer)
565
- processed_content.append(f'<p>{joined}</p>')
566
- paragraph_buffer = []
567
-
568
- # Simple markdown table detection and rendering
569
- def is_md_table_header(s: str) -> bool:
570
- return '|' in s and ('---' in s or '—' in s)
571
-
572
- def render_md_table(lines: List[str]) -> str:
573
- rows = [l.strip().strip('|').split('|') for l in lines]
574
- rows = [[_html.escape(c.strip()) for c in r] for r in rows]
575
- if len(rows) < 2:
576
- return ""
577
- header = rows[0]
578
- body = rows[2:] if len(rows) > 2 else []
579
- thead = '<thead><tr>' + ''.join(f'<th>{c}</th>' for c in header) + '</tr></thead>'
580
- tbody = '<tbody>' + ''.join('<tr>' + ''.join(f'<td>{c}</td>' for c in r) + '</tr>' for r in body) + '</tbody>'
581
- return f'<table class="doc-table">{thead}{tbody}</table>'
582
-
583
- i = 0
584
- lines = page['content']
585
- n = len(lines)
586
- while i < n:
587
- raw_line = lines[i]
588
- line = raw_line.rstrip('\r\n')
589
- stripped = line.strip()
590
- if stripped.startswith('![') and ('](images/' in stripped or '](images\\' in stripped):
591
- flush_paragraph()
592
- match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
593
- if match and base_dir is not None:
594
- caption = match.group(1)
595
- rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
596
- abs_path = (base_dir / rel_path).resolve()
597
- try:
598
- with open(abs_path, 'rb') as f:
599
- b64 = base64.b64encode(f.read()).decode('ascii')
600
- processed_content.append(f'<figure><img src="data:image/jpeg;base64,{b64}" alt="{_html.escape(caption)}"/><figcaption>{_html.escape(caption)}</figcaption></figure>')
601
- except Exception:
602
- processed_content.append(f'<div>{_html.escape(caption)} (image not found)</div>')
603
- else:
604
- paragraph_buffer.append(raw_line)
605
- i += 1
606
- continue
607
-
608
- # Detect markdown table blocks - only if line starts with | or has multiple | characters
609
- if (stripped.startswith('|') or stripped.count('|') >= 2) and i + 1 < n and is_md_table_header(lines[i + 1]):
610
- flush_paragraph()
611
- table_block = [stripped]
612
- i += 1
613
- table_block.append(lines[i].strip())
614
- i += 1
615
- while i < n:
616
- nxt = lines[i].rstrip('\r\n')
617
- if nxt.strip() == '' or (not nxt.strip().startswith('|') and nxt.count('|') < 2):
618
- break
619
- table_block.append(nxt.strip())
620
- i += 1
621
- html_table = render_md_table(table_block)
622
- if html_table:
623
- processed_content.append(html_table)
624
- else:
625
- for tl in table_block:
626
- paragraph_buffer.append(tl)
627
- continue
628
-
629
- if stripped.startswith('## '):
630
- flush_paragraph()
631
- processed_content.append(f'<h3>{_html.escape(stripped[3:])}</h3>')
632
- elif stripped.startswith('# '):
633
- flush_paragraph()
634
- processed_content.append(f'<h2>{_html.escape(stripped[2:])}</h2>')
635
- elif stripped == '':
636
- flush_paragraph()
637
- processed_content.append('<br/>')
638
- else:
639
- paragraph_buffer.append(raw_line)
640
- i += 1
641
- flush_paragraph()
642
-
643
- # Join the processed content lines
644
- content = "\n".join(processed_content)
645
-
646
- # Ensure page images are prepared
647
- try:
648
- if pdf_path and not page_images:
649
- tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
650
- pil_pages = render_pdf_to_images(pdf_path)
651
- saved_paths: List[str] = []
652
- for idx, (im, _, _) in enumerate(pil_pages, start=1):
653
- out_path = tmp_img_dir / f"page_{idx:03d}.jpg"
654
- im.save(out_path, format="JPEG", quality=90)
655
- saved_paths.append(str(out_path))
656
- page_images = saved_paths
657
- page_images_state.value = saved_paths # cache
658
- except Exception as e:
659
- pass
660
-
661
- # Select image for the current page number (1-based)
662
- page_img = None
663
- try:
664
- page_index = int(page_num)
665
- if page_images and 1 <= page_index <= len(page_images):
666
- page_img = page_images[page_index - 1]
667
- except Exception:
668
- page_img = None
669
-
670
- return content, page_img
671
-
672
- def filter_gallery_by_image(img_path, caption, all_images):
673
- """Filter gallery to show only the selected image."""
674
- if not img_path or not all_images:
675
- return all_images
676
-
677
- # Find the selected image
678
- filtered_images = []
679
- for stored_img_path, stored_caption in all_images:
680
- if stored_caption == caption:
681
- filtered_images.append((stored_img_path, stored_caption))
682
- break
683
-
684
- return filtered_images
685
-
686
- def trigger_image_filter(filter_input):
687
- """Trigger image filtering when input changes."""
688
- if not filter_input:
689
- return "", ""
690
-
691
- # Parse the input (format: "img_path|caption")
692
- parts = filter_input.split("|", 1)
693
- if len(parts) == 2:
694
- img_path, caption = parts
695
- return img_path, caption
696
- return "", ""
697
-
698
- def filter_gallery_by_trigger(img_path, caption, all_images):
699
- """Filter gallery based on trigger values."""
700
- if not img_path or not caption or not all_images:
701
- return all_images
702
-
703
- # Find the selected image
704
- filtered_images = []
705
- for stored_img_path, stored_caption in all_images:
706
- if stored_caption == caption:
707
- filtered_images.append((stored_img_path, stored_caption))
708
- break
709
-
710
- return filtered_images
711
-
712
- def run_full_parse_with_pages(*args):
713
- """Run full parse and parse the markdown into pages."""
714
- result = run_full_parse(*args)
715
- status_msg, md_content, gallery_items, file_paths, zip_path = result
716
-
717
- # Parse markdown into pages
718
- pages_data = []
719
- first_page_content = ""
720
- all_images = []
721
- if md_content:
722
- pages_data = parse_markdown_by_pages(md_content)
723
-
724
- # Collect all images from all pages
725
- for page in pages_data:
726
- for line in page['content']:
727
- if line.strip().startswith('![') and ('](images/' in line or '](images\\' in line):
728
- match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line.strip())
729
- if match:
730
- caption = match.group(1)
731
- img_path = match.group(2)
732
- all_images.append((img_path, caption))
733
-
734
-
735
- # Show only Page 1 content initially
736
- if pages_data:
737
- first_page = pages_data[0]
738
- first_page_content = "\n".join(first_page['content'])
739
-
740
- # Prepare first page image immediately and cache page images
741
- input_pdf_path = args[0]
742
- first_page_image = None
743
- saved_paths: List[str] = []
744
- try:
745
- if input_pdf_path:
746
- tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
747
- pil_pages = render_pdf_to_images(input_pdf_path)
748
- for idx, (im, _, _) in enumerate(pil_pages, start=1):
749
- out_path = tmp_img_dir / f"page_{idx:03d}.jpg"
750
- im.save(out_path, format="JPEG", quality=90)
751
- saved_paths.append(str(out_path))
752
- if saved_paths:
753
- first_page_image = saved_paths[0]
754
- except Exception as e:
755
- pass
756
-
757
- # Build initial HTML with inline images and proper blocks for first page
758
- if pages_data:
759
- base_dir = None
760
- try:
761
- stem = Path(input_pdf_path).stem if input_pdf_path else ""
762
- if stem:
763
- base_dir = Path("outputs") / stem / "full_parse"
764
- except Exception:
765
- base_dir = None
766
- html_lines: List[str] = []
767
- for raw_line in pages_data[0]['content']:
768
- line = raw_line.strip()
769
- if line.startswith('![') and ('](images/' in line or '](images\\' in line):
770
- match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
771
- if match and base_dir is not None:
772
- caption = match.group(1)
773
- rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
774
- abs_path = (base_dir / rel_path).resolve()
775
- try:
776
- with open(abs_path, 'rb') as f:
777
- b64 = base64.b64encode(f.read()).decode('ascii')
778
- html_lines.append(f'<figure><img src="data:image/jpeg;base64,{b64}" alt="{_html.escape(caption)}"/><figcaption>{_html.escape(caption)}</figcaption></figure>')
779
- except Exception:
780
- html_lines.append(f'<div>{_html.escape(caption)} (image not found)</div>')
781
- else:
782
- html_lines.append(f'<p>{_html.escape(raw_line)}</p>')
783
- else:
784
- if line.startswith('## '):
785
- html_lines.append(f'<h3>{_html.escape(line[3:])}</h3>')
786
- elif line.startswith('# '):
787
- html_lines.append(f'<h2>{_html.escape(line[2:])}</h2>')
788
- elif line == '':
789
- html_lines.append('<br/>')
790
- else:
791
- html_lines.append(f'<p>{_html.escape(raw_line)}</p>')
792
- first_page_content = "\n".join(html_lines)
793
-
794
- return status_msg, first_page_content, first_page_image, gallery_items, file_paths, zip_path, pages_data, all_images, input_pdf_path, saved_paths
795
-
796
- run_btn.click(
797
- fn=run_full_parse_with_pages,
798
- inputs=[pdf, use_vlm, vlm_provider, vlm_api_key, layout_model, dpi, min_score, ocr_lang, ocr_psm, ocr_oem, ocr_config, box_sep],
799
- outputs=[status, md_preview, page_image, gallery, files_out, zip_out, pages_state, all_images_state, pdf_path_state, page_images_state],
800
- ).then(
801
- fn=update_page_selector,
802
- inputs=[pages_state],
803
- outputs=[page_selector],
804
- )
805
-
806
- page_selector.change(
807
- fn=display_selected_page,
808
- inputs=[page_selector, pages_state, pdf_path_state, page_images_state],
809
- outputs=[md_preview, page_image],
810
- )
811
-
812
- image_filter_input.change(
813
- fn=trigger_image_filter,
814
- inputs=[image_filter_input],
815
- outputs=[current_image_path, current_image_caption],
816
- ).then(
817
- fn=filter_gallery_by_trigger,
818
- inputs=[current_image_path, current_image_caption, all_images_state],
819
- outputs=[gallery],
820
- )
821
-
822
- with gr.Tab("Extract Tables/Charts"):
823
- with gr.Row():
824
- pdf_e = gr.File(file_types=[".pdf"], label="PDF")
825
- target = gr.Dropdown(["tables", "charts", "both"], value="both", label="Target")
826
- use_vlm_e = gr.Checkbox(label="Use VLM (optional)", value=False)
827
- vlm_provider_e = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
828
- vlm_api_key_e = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
829
- with gr.Accordion("Advanced", open=False):
830
- with gr.Row():
831
- layout_model_e = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
832
- dpi_e = gr.Slider(100, 400, value=200, step=10, label="DPI")
833
- min_score_e = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")
834
-
835
- run_btn_e = gr.Button("▶ Run Extraction", variant="primary")
836
- status_e = gr.Textbox(label="Status")
837
- # Dropdown to select specific item
838
- item_selector_e = gr.Dropdown(label="Select Item", visible=False, interactive=True)
839
-
840
- # Display extracted data and images
841
- with gr.Row():
842
- tables_preview_e = gr.HTML(label="Extracted Data", elem_classes=["page-content"])
843
- image_e = gr.Image(label="Selected Image", interactive=False)
844
-
845
- # Keep gallery for reference but make it smaller
846
- gallery_e = gr.Gallery(label="All Extracted Images", columns=4, height=200, preview=True)
847
- files_out_e = gr.Files(label="Download individual output files")
848
- zip_out_e = gr.File(label="Download all outputs (ZIP)")
849
-
850
- # State to store output directory
851
- out_dir_state = gr.State("")
852
-
853
- def capture_out_dir(status_text):
854
- if not status_text:
855
- return ""
856
- try:
857
- if "Output directory:" in status_text:
858
- return status_text.split("Output directory:", 1)[1].strip()
859
- except Exception:
860
- pass
861
- return ""
862
-
863
- def build_item_selector(out_dir_path, target, use_vlm):
864
- if not out_dir_path or not use_vlm:
865
- return gr.Dropdown(choices=[], value=None, visible=False)
866
-
867
- try:
868
- out_dir = Path(out_dir_path)
869
- mapping = out_dir / "vlm_items.json"
870
- if not mapping.exists():
871
- return gr.Dropdown(choices=[], value=None, visible=False)
872
-
873
- data = json.loads(mapping.read_text(encoding="utf-8"))
874
- choices = []
875
-
876
- for entry in data:
877
- kind = entry.get("kind")
878
- # Filter based on target
879
- if target == "both" or (target == "tables" and kind == "table") or (target == "charts" and kind == "chart"):
880
- title = entry.get("title") or f"{kind.title()}"
881
- page = entry.get("page")
882
- rel_path = entry.get("image_rel_path")
883
- label = f"{title} — Page {page}"
884
- choices.append((label, rel_path))
885
-
886
- return gr.Dropdown(choices=choices, value=choices[0][1] if choices else None, visible=bool(choices))
887
- except Exception:
888
- return gr.Dropdown(choices=[], value=None, visible=False)
889
-
890
- def show_selected_item(rel_path, out_dir_path):
891
- if not rel_path or not out_dir_path:
892
- return "", None
893
-
894
- try:
895
- out_dir = Path(out_dir_path)
896
- mapping = out_dir / "vlm_items.json"
897
- if not mapping.exists():
898
- return "", None
899
-
900
- data = json.loads(mapping.read_text(encoding="utf-8"))
901
-
902
- for entry in data:
903
- if entry.get("image_rel_path") == rel_path:
904
- headers = entry.get("headers") or []
905
- rows = entry.get("rows") or []
906
- title = entry.get("title") or "Data"
907
- kind = entry.get("kind", "table")
908
-
909
- # Create HTML table
910
- if headers and rows:
911
- thead = '<thead><tr>' + ''.join(f'<th>{_html.escape(str(h))}</th>' for h in headers) + '</tr></thead>'
912
- tbody = '<tbody>' + ''.join('<tr>' + ''.join(f'<td>{_html.escape(str(c))}</td>' for c in r) + '</tr>' for r in rows) + '</tbody>'
913
- html_table = f'<h3>{_html.escape(title)} ({kind.title()})</h3><table class="doc-table">{thead}{tbody}</table>'
914
- else:
915
- html_table = f'<h3>{_html.escape(title)} ({kind.title()})</h3><p>No structured data available</p>'
916
-
917
- # Get image path
918
- img_abs = str((out_dir / rel_path).resolve())
919
- return html_table, img_abs
920
-
921
- return "", None
922
- except Exception:
923
- return "", None
924
-
925
- run_btn_e.click(
926
- fn=lambda f, t, a, b, c, d, e, g: run_extract(
927
- f.name if f else "",
928
- t,
929
- a,
930
- b,
931
- c,
932
- d,
933
- e,
934
- g,
935
- ),
936
- inputs=[pdf_e, target, use_vlm_e, vlm_provider_e, vlm_api_key_e, layout_model_e, dpi_e, min_score_e],
937
- outputs=[status_e, tables_preview_e, gallery_e, files_out_e, zip_out_e],
938
- ).then(
939
- fn=capture_out_dir,
940
- inputs=[status_e],
941
- outputs=[out_dir_state]
942
- ).then(
943
- fn=build_item_selector,
944
- inputs=[out_dir_state, target, use_vlm_e],
945
- outputs=[item_selector_e]
946
- ).then(
947
- fn=show_selected_item,
948
- inputs=[item_selector_e, out_dir_state],
949
- outputs=[tables_preview_e, image_e]
950
- )
951
-
952
- # Handle dropdown selection changes
953
- item_selector_e.change(
954
- fn=show_selected_item,
955
- inputs=[item_selector_e, out_dir_state],
956
- outputs=[tables_preview_e, image_e]
957
- )
958
-
959
-
960
- gr.Markdown(
961
- """
962
- <div class="card">
963
- <b>Tips</b>
964
- <ul>
965
- <li>On Spaces, set a secret <code>VLM_API_KEY</code> to enable VLM features.</li>
966
- <li>Outputs are saved under <code>outputs/&lt;pdf_stem&gt;/</code>.</li>
967
- </ul>
968
- </div>
969
- """
970
- )
51
+ # Tips section
52
+ gr.Markdown(create_tips_markdown())
971
53
 
972
54
  return demo
973
55
 
974
56
 
975
- def launch_ui(server_name: str = "0.0.0.0", server_port: int = 7860, share: bool = False):
57
+ def launch_ui():
58
+ """
59
+ Launch the Doctra Gradio interface.
60
+
61
+ This function creates and launches the main application interface.
62
+ """
976
63
  demo = build_demo()
977
- demo.launch(server_name=server_name, server_port=server_port, share=share)
978
-
979
-
64
+ demo.launch()