doctra 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
doctra/ui/app.py CHANGED
@@ -1,434 +1,38 @@
1
- import os
2
- import shutil
3
- import tempfile
4
- import re
5
- from pathlib import Path
6
- from typing import Optional, Tuple, List, Dict, Any
7
-
8
- import gradio as gr
9
-
10
- from doctra.parsers.structured_pdf_parser import StructuredPDFParser
11
- from doctra.parsers.table_chart_extractor import ChartTablePDFParser
12
-
13
-
14
- def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zip_filename: Optional[str] = None, is_structured_parsing: bool = False) -> Tuple[List[tuple[str, str]], List[str], str]:
15
- gallery_items: List[tuple[str, str]] = []
16
- file_paths: List[str] = []
1
+ """
2
+ Main Doctra Gradio Application
17
3
 
18
- if out_dir.exists():
19
- if is_structured_parsing:
20
- for file_path in sorted(out_dir.rglob("*")):
21
- if file_path.is_file():
22
- file_paths.append(str(file_path))
23
- else:
24
- main_files = [
25
- "result.html",
26
- "result.md",
27
- "tables.html",
28
- "tables.xlsx"
29
- ]
30
-
31
- for main_file in main_files:
32
- file_path = out_dir / main_file
33
- if file_path.exists():
34
- file_paths.append(str(file_path))
35
-
36
- if allowed_kinds:
37
- for kind in allowed_kinds:
38
- p = out_dir / kind
39
- if p.exists():
40
- for img in sorted(p.glob("*.png")):
41
- file_paths.append(str(img))
42
-
43
- images_dir = out_dir / "images" / kind
44
- if images_dir.exists():
45
- for img in sorted(images_dir.glob("*.jpg")):
46
- file_paths.append(str(img))
47
- else:
48
- for p in (out_dir / "charts").glob("*.png"):
49
- file_paths.append(str(p))
50
- for p in (out_dir / "tables").glob("*.png"):
51
- file_paths.append(str(p))
52
- for p in (out_dir / "images").rglob("*.jpg"):
53
- file_paths.append(str(p))
4
+ This module serves as the main entry point for the Doctra Gradio interface.
5
+ It imports and composes the modular UI components for a clean, maintainable structure.
54
6
 
55
- if allowed_kinds:
56
- if "charts" in allowed_kinds and "tables" in allowed_kinds:
57
- excel_files = ["parsed_tables_charts.xlsx"]
58
- elif "charts" in allowed_kinds:
59
- excel_files = ["parsed_charts.xlsx"]
60
- elif "tables" in allowed_kinds:
61
- excel_files = ["parsed_tables.xlsx"]
62
- else:
63
- excel_files = []
64
-
65
- for excel_file in excel_files:
66
- excel_path = out_dir / excel_file
67
- if excel_path.exists():
68
- file_paths.append(str(excel_path))
7
+ The application is organized into the following modules:
8
+ - ui_helpers.py: Shared utilities, constants, and helper functions
9
+ - full_parse_ui.py: Full PDF parsing functionality with page navigation
10
+ - tables_charts_ui.py: Table and chart extraction with VLM integration
11
+ - docres_ui.py: Image restoration functionality with before/after comparison
12
+ - enhanced_parser_ui.py: Enhanced PDF parsing with DocRes image restoration
69
13
 
70
- kinds = allowed_kinds if allowed_kinds else ["tables", "charts", "figures"]
71
- for sub in kinds:
72
- p = out_dir / sub
73
- if p.exists():
74
- for img in sorted(p.glob("*.png")):
75
- gallery_items.append((str(img), f"{sub}: {img.name}"))
76
-
77
- images_dir = out_dir / "images" / sub
78
- if images_dir.exists():
79
- for img in sorted(images_dir.glob("*.jpg")):
80
- gallery_items.append((str(img), f"{sub}: {img.name}"))
14
+ Each module is self-contained with its own state management and event handlers,
15
+ making the codebase easier to navigate, test, and extend.
16
+ """
81
17
 
82
- tmp_zip_dir = Path(tempfile.mkdtemp(prefix="doctra_zip_"))
83
-
84
- if zip_filename:
85
- safe_filename = re.sub(r'[<>:"/\\|?*]', '_', zip_filename)
86
- zip_base = tmp_zip_dir / safe_filename
87
- else:
88
- zip_base = tmp_zip_dir / "doctra_outputs"
89
-
90
- filtered_dir = tmp_zip_dir / "filtered_outputs"
91
- shutil.copytree(out_dir, filtered_dir, ignore=shutil.ignore_patterns('~$*', '*.tmp', '*.temp'))
92
-
93
- zip_path = shutil.make_archive(str(zip_base), 'zip', root_dir=str(filtered_dir))
18
+ import gradio as gr
94
19
 
95
- return gallery_items, file_paths, zip_path
20
+ from doctra.ui.ui_helpers import THEME, CUSTOM_CSS, create_tips_markdown
21
+ from doctra.ui.full_parse_ui import create_full_parse_tab
22
+ from doctra.ui.tables_charts_ui import create_tables_charts_tab
23
+ from doctra.ui.docres_ui import create_docres_tab
24
+ from doctra.ui.enhanced_parser_ui import create_enhanced_parser_tab
96
25
 
97
26
 
98
- def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
99
- """
100
- Parse markdown content and organize it by pages.
101
- Returns a list of page dictionaries with content, tables, charts, and figures.
27
+ def build_demo() -> gr.Blocks:
102
28
  """
103
- import re
104
-
105
- pages = []
106
- current_page = None
107
-
108
- lines = md_content.split('\n')
109
- i = 0
110
-
111
- while i < len(lines):
112
- line = lines[i].strip()
113
-
114
- if line.startswith('## Page '):
115
- if current_page:
116
- pages.append(current_page)
117
-
118
- page_num = line.replace('## Page ', '').strip()
119
- current_page = {
120
- 'page_num': page_num,
121
- 'content': [],
122
- 'tables': [],
123
- 'charts': [],
124
- 'figures': [],
125
- 'images': [],
126
- 'full_content': [] # Store full content with inline images
127
- }
128
- i += 1
129
- continue
130
-
131
- if line.startswith('![') and '](images/' in line:
132
- match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
133
- if match:
134
- caption = match.group(1)
135
- img_path = match.group(2)
136
-
137
- if 'Table' in caption:
138
- current_page['tables'].append({'caption': caption, 'path': img_path})
139
- elif 'Chart' in caption:
140
- current_page['charts'].append({'caption': caption, 'path': img_path})
141
- elif 'Figure' in caption:
142
- current_page['figures'].append({'caption': caption, 'path': img_path})
143
-
144
- current_page['images'].append({'caption': caption, 'path': img_path})
145
-
146
- current_page['full_content'].append(f"![{caption}]({img_path})")
147
-
148
- elif current_page:
149
- if line:
150
- current_page['content'].append(line)
151
- current_page['full_content'].append(line)
152
-
153
- i += 1
154
-
155
- if current_page:
156
- pages.append(current_page)
29
+ Build the main Doctra Gradio interface using modular components.
157
30
 
158
- return pages
159
-
160
-
161
- def run_full_parse(
162
- pdf_file: str,
163
- use_vlm: bool,
164
- vlm_provider: str,
165
- vlm_api_key: str,
166
- layout_model_name: str,
167
- dpi: int,
168
- min_score: float,
169
- ocr_lang: str,
170
- ocr_psm: int,
171
- ocr_oem: int,
172
- ocr_extra_config: str,
173
- box_separator: str,
174
- ) -> Tuple[str, Optional[str], List[tuple[str, str]], List[str], str]:
175
- if not pdf_file:
176
- return ("No file provided.", None, [], [], "")
177
-
178
- original_filename = Path(pdf_file).stem
179
-
180
- tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
181
- input_pdf = tmp_dir / f"{original_filename}.pdf"
182
- shutil.copy2(pdf_file, input_pdf)
183
-
184
- # Validate VLM configuration
185
- if use_vlm and not vlm_api_key:
186
- return ("❌ Error: VLM API key is required when using VLM", None, [], [], "")
187
-
188
- if use_vlm and vlm_api_key:
189
- # Basic API key validation
190
- if len(vlm_api_key.strip()) < 10:
191
- return ("❌ Error: VLM API key appears to be too short or invalid", None, [], [], "")
192
- if vlm_api_key.strip().startswith('sk-') and len(vlm_api_key.strip()) < 20:
193
- return ("❌ Error: OpenAI API key appears to be invalid (too short)", None, [], [], "")
194
-
195
- parser = StructuredPDFParser(
196
- use_vlm=use_vlm,
197
- vlm_provider=vlm_provider,
198
- vlm_api_key=vlm_api_key or None,
199
- layout_model_name=layout_model_name,
200
- dpi=int(dpi),
201
- min_score=float(min_score),
202
- ocr_lang=ocr_lang,
203
- ocr_psm=int(ocr_psm),
204
- ocr_oem=int(ocr_oem),
205
- ocr_extra_config=ocr_extra_config or "",
206
- box_separator=box_separator or "\n",
207
- )
208
-
209
- try:
210
- parser.parse(str(input_pdf))
211
- except Exception as e:
212
- import traceback
213
- traceback.print_exc()
214
- # Safely encode error message for return value
215
- try:
216
- error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
217
- return (f"❌ VLM processing failed: {error_msg}", None, [], [], "")
218
- except Exception:
219
- return (f"❌ VLM processing failed: <Unicode encoding error>", None, [], [], "")
220
-
221
- outputs_root = Path("outputs")
222
- out_dir = outputs_root / original_filename / "full_parse"
223
- if not out_dir.exists():
224
- # fallback: search latest created dir under outputs
225
- candidates = sorted(outputs_root.glob("*/"), key=lambda p: p.stat().st_mtime, reverse=True)
226
- if candidates:
227
- out_dir = candidates[0] / "full_parse"
228
- else:
229
- out_dir = outputs_root
230
-
231
- md_file = next(out_dir.glob("*.md"), None)
232
- md_preview = None
233
- if md_file and md_file.exists():
234
- try:
235
- with md_file.open("r", encoding="utf-8", errors="ignore") as f:
236
- md_preview = f.read() # Return the full markdown content
237
- except Exception:
238
- md_preview = None
239
-
240
- gallery_items, file_paths, zip_path = _gather_outputs(out_dir, zip_filename=original_filename, is_structured_parsing=False)
241
- return (f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}", md_preview, gallery_items, file_paths, zip_path)
242
-
243
-
244
- def run_extract(
245
- pdf_file: str,
246
- target: str,
247
- use_vlm: bool,
248
- vlm_provider: str,
249
- vlm_api_key: str,
250
- layout_model_name: str,
251
- dpi: int,
252
- min_score: float,
253
- ) -> Tuple[str, str, List[tuple[str, str]], List[str], str]:
254
- if not pdf_file:
255
- return ("No file provided.", "", [], [], "")
256
-
257
- # Validate VLM configuration
258
- if use_vlm and not vlm_api_key:
259
- return ("❌ Error: VLM API key is required when using VLM", "", [], [], "")
260
-
261
- if use_vlm and vlm_api_key:
262
- # Basic API key validation
263
- if len(vlm_api_key.strip()) < 10:
264
- return ("❌ Error: VLM API key appears to be too short or invalid", "", [], [], "")
265
- if vlm_api_key.strip().startswith('sk-') and len(vlm_api_key.strip()) < 20:
266
- return ("❌ Error: OpenAI API key appears to be invalid (too short)", "", [], [], "")
267
-
268
- # Extract filename from the uploaded file path
269
- original_filename = Path(pdf_file).stem
270
-
271
- tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
272
- input_pdf = tmp_dir / f"{original_filename}.pdf"
273
- shutil.copy2(pdf_file, input_pdf)
274
-
275
- parser = ChartTablePDFParser(
276
- extract_charts=(target in ("charts", "both")),
277
- extract_tables=(target in ("tables", "both")),
278
- use_vlm=use_vlm,
279
- vlm_provider=vlm_provider,
280
- vlm_api_key=vlm_api_key or None,
281
- layout_model_name=layout_model_name,
282
- dpi=int(dpi),
283
- min_score=float(min_score),
284
- )
285
-
286
- output_base = Path("outputs")
287
- parser.parse(str(input_pdf), str(output_base))
288
-
289
- outputs_root = output_base
290
- out_dir = outputs_root / original_filename / "structured_parsing"
291
- if not out_dir.exists():
292
- if outputs_root.exists():
293
- candidates = sorted(outputs_root.glob("*/"), key=lambda p: p.stat().st_mtime, reverse=True)
294
- if candidates:
295
- out_dir = candidates[0] / "structured_parsing"
296
- else:
297
- out_dir = outputs_root
298
- else:
299
- outputs_root.mkdir(parents=True, exist_ok=True)
300
- out_dir = outputs_root
301
-
302
- # Determine which kinds to include in outputs based on target selection
303
- allowed_kinds: Optional[List[str]] = None
304
- if target in ("tables", "charts"):
305
- allowed_kinds = [target]
306
- elif target == "both":
307
- allowed_kinds = ["tables", "charts"]
308
-
309
- gallery_items, file_paths, zip_path = _gather_outputs(out_dir, allowed_kinds, zip_filename=original_filename, is_structured_parsing=True)
310
-
311
- # Build tables HTML preview from Excel data (when VLM enabled)
312
- tables_html = ""
313
- try:
314
- if use_vlm:
315
- # Find Excel file based on target
316
- excel_filename = None
317
- if target in ("tables", "charts"):
318
- if target == "tables":
319
- excel_filename = "parsed_tables.xlsx"
320
- else: # charts
321
- excel_filename = "parsed_charts.xlsx"
322
- elif target == "both":
323
- excel_filename = "parsed_tables_charts.xlsx"
324
-
325
- if excel_filename:
326
- excel_path = out_dir / excel_filename
327
- if excel_path.exists():
328
- import pandas as pd
329
- import html as _html
330
-
331
- # Read Excel file and create HTML tables
332
- xl_file = pd.ExcelFile(excel_path)
333
- html_blocks = []
334
-
335
- for sheet_name in xl_file.sheet_names:
336
- df = pd.read_excel(excel_path, sheet_name=sheet_name)
337
- if not df.empty:
338
- # Create table with title
339
- title = f"<h3>{_html.escape(sheet_name)}</h3>"
340
-
341
- # Convert DataFrame to HTML table
342
- table_html = df.to_html(
343
- classes="doc-table",
344
- table_id=None,
345
- escape=True,
346
- index=False,
347
- na_rep=""
348
- )
349
-
350
- html_blocks.append(title + table_html)
351
-
352
- tables_html = "\n".join(html_blocks)
353
- except Exception as e:
354
- # Safely encode error message to handle Unicode characters
355
- try:
356
- error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
357
- print(f"Error building tables HTML: {error_msg}")
358
- except Exception:
359
- print(f"Error building tables HTML: <Unicode encoding error>")
360
- tables_html = ""
361
-
362
- return (f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}", tables_html, gallery_items, file_paths, zip_path)
363
-
364
-
365
- THEME = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")
366
-
367
- CUSTOM_CSS = """
368
- .gradio-container {max-width: 100% !important; padding-left: 24px; padding-right: 24px}
369
- .container {max-width: 100% !important}
370
- .app {max-width: 100% !important}
371
- .header {margin-bottom: 8px}
372
- .subtitle {color: var(--body-text-color-subdued)}
373
- .card {border:1px solid var(--border-color); border-radius:12px; padding:8px}
374
- .status-ok {color: var(--color-success)}
375
-
376
- /* Page content styling */
377
- .page-content img {
378
- max-width: 100% !important;
379
- height: auto !important;
380
- display: block !important;
381
- margin: 10px auto !important;
382
- border: 1px solid #ddd !important;
383
- border-radius: 8px !important;
384
- box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
385
- }
386
-
387
- .page-content {
388
- max-height: none !important;
389
- overflow: visible !important;
390
- }
391
-
392
- /* Table styling */
393
- .page-content table.doc-table {
394
- width: 100% !important;
395
- border-collapse: collapse !important;
396
- margin: 12px 0 !important;
397
- }
398
- .page-content table.doc-table th,
399
- .page-content table.doc-table td {
400
- border: 1px solid #e5e7eb !important;
401
- padding: 8px 10px !important;
402
- text-align: left !important;
403
- }
404
- .page-content table.doc-table thead th {
405
- background: #f9fafb !important;
406
- font-weight: 600 !important;
407
- }
408
- .page-content table.doc-table tbody tr:nth-child(even) td {
409
- background: #fafafa !important;
410
- }
411
-
412
- /* Clickable image buttons */
413
- .image-button {
414
- background: #0066cc !important;
415
- color: white !important;
416
- border: none !important;
417
- padding: 5px 10px !important;
418
- border-radius: 4px !important;
419
- cursor: pointer !important;
420
- margin: 2px !important;
421
- font-size: 14px !important;
422
- }
423
-
424
- .image-button:hover {
425
- background: #0052a3 !important;
426
- }
427
- """
428
-
429
-
430
- def build_demo() -> gr.Blocks:
31
+ Returns:
32
+ Configured Gradio Blocks interface
33
+ """
431
34
  with gr.Blocks(title="Doctra - Document Parser", theme=THEME, css=CUSTOM_CSS) as demo:
35
+ # Header section
432
36
  gr.Markdown(
433
37
  """
434
38
  <div class="header">
@@ -438,548 +42,23 @@ def build_demo() -> gr.Blocks:
438
42
  """
439
43
  )
440
44
 
45
+ # Create modular tabs
46
+ full_parse_tab, full_parse_state = create_full_parse_tab()
47
+ tables_charts_tab, tables_charts_state = create_tables_charts_tab()
48
+ docres_tab, docres_state = create_docres_tab()
49
+ enhanced_parser_tab, enhanced_parser_state = create_enhanced_parser_tab()
441
50
 
442
- with gr.Tab("Full Parse"):
443
- with gr.Row():
444
- pdf = gr.File(file_types=[".pdf"], label="PDF")
445
- use_vlm = gr.Checkbox(label="Use VLM (optional)", value=False)
446
- vlm_provider = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
447
- vlm_api_key = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
448
-
449
- with gr.Accordion("Advanced", open=False):
450
- with gr.Row():
451
- layout_model = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
452
- dpi = gr.Slider(100, 400, value=200, step=10, label="DPI")
453
- min_score = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")
454
- with gr.Row():
455
- ocr_lang = gr.Textbox(value="eng", label="OCR Language")
456
- ocr_psm = gr.Slider(0, 13, value=4, step=1, label="Tesseract PSM")
457
- ocr_oem = gr.Slider(0, 3, value=3, step=1, label="Tesseract OEM")
458
- with gr.Row():
459
- ocr_config = gr.Textbox(value="", label="Extra OCR config")
460
- box_sep = gr.Textbox(value="\n", label="Box separator")
461
-
462
- run_btn = gr.Button("▶ Run Full Parse", variant="primary")
463
- status = gr.Textbox(label="Status", elem_classes=["status-ok"])
464
-
465
- # Page selector for extracted content
466
- page_selector = gr.Dropdown(label="Select Page to Display", interactive=True, visible=False)
467
-
468
- # Full Parse components
469
- with gr.Row():
470
- with gr.Column():
471
- md_preview = gr.HTML(label="Extracted Content", visible=True, elem_classes=["page-content"])
472
- with gr.Column():
473
- page_image = gr.Image(label="Page image", interactive=False)
474
- gallery = gr.Gallery(label="Extracted images (tables/charts/figures)", columns=4, height=420, preview=True)
475
- files_out = gr.Files(label="Download individual output files")
476
- zip_out = gr.File(label="Download all outputs (ZIP)")
477
-
478
- # Hidden state to store pages data and all images
479
- pages_state = gr.State([])
480
- all_images_state = gr.State([])
481
- pdf_path_state = gr.State("")
482
- page_images_state = gr.State([]) # list of file paths per page index (1-based)
483
-
484
- # Hidden components for image filtering
485
- filter_trigger = gr.Button(visible=False)
486
- current_image_path = gr.State("")
487
- current_image_caption = gr.State("")
488
- image_filter_input = gr.Textbox(visible=False, elem_id="image_filter_input")
489
-
490
- def parse_markdown_by_pages(md_content: str):
491
- """Parse markdown content and organize it by pages."""
492
- import re
493
-
494
- pages = []
495
- current_page = None
496
-
497
- lines = md_content.split('\n')
498
- i = 0
499
-
500
-
501
- # First, let's find all page headers
502
- page_headers = []
503
- for i, line in enumerate(lines):
504
- if line.strip().startswith('## Page '):
505
- page_num = line.strip().replace('## Page ', '').strip()
506
- page_headers.append((i, page_num, line))
507
-
508
-
509
- # Now parse content for each page
510
- for i, (line_idx, page_num, header_line) in enumerate(page_headers):
511
- # Find the end of this page (start of next page or end of document)
512
- start_line = line_idx
513
- if i + 1 < len(page_headers):
514
- end_line = page_headers[i + 1][0]
515
- else:
516
- end_line = len(lines)
517
-
518
- # Extract content for this page
519
- page_content = lines[start_line:end_line]
520
-
521
- page = {
522
- 'page_num': page_num,
523
- 'content': page_content
524
- }
525
- pages.append(page)
526
-
527
- return pages
528
-
529
- def update_page_selector(pages_data):
530
- """Update the page selector dropdown with available pages."""
531
- if not pages_data:
532
- return gr.Dropdown(choices=[], value=None, visible=False)
533
-
534
- page_choices = [f"Page {page['page_num']}" for page in pages_data]
535
- return gr.Dropdown(choices=page_choices, value=page_choices[0], visible=True)
536
-
537
- def display_selected_page(selected_page, pages_data, pdf_path, page_images):
538
- """Display the content of the selected page and the rendered page image."""
539
- if not selected_page or not pages_data:
540
- return "", None
541
-
542
-
543
- # Find the selected page
544
- page_num = selected_page.replace("Page ", "")
545
- page = next((p for p in pages_data if p['page_num'] == page_num), None)
546
-
547
- if not page:
548
- return "Page not found", None
549
-
550
- # Build HTML with inline base64 images, render markdown tables, and preserve paragraphs/line breaks
551
- import html as _html, base64, re as _re
552
- base_dir = None
553
- try:
554
- stem = Path(pdf_path).stem if pdf_path else ""
555
- if stem:
556
- base_dir = Path("outputs") / stem / "full_parse"
557
- except Exception:
558
- base_dir = None
559
- processed_content = []
560
- paragraph_buffer = []
561
- def flush_paragraph():
562
- nonlocal paragraph_buffer
563
- if paragraph_buffer:
564
- joined = '<br/>'.join(_html.escape(l) for l in paragraph_buffer)
565
- processed_content.append(f'<p>{joined}</p>')
566
- paragraph_buffer = []
567
-
568
- # Simple markdown table detection and rendering
569
- def is_md_table_header(s: str) -> bool:
570
- return '|' in s and ('---' in s or '—' in s)
571
-
572
- def render_md_table(lines: List[str]) -> str:
573
- rows = [l.strip().strip('|').split('|') for l in lines]
574
- rows = [[_html.escape(c.strip()) for c in r] for r in rows]
575
- if len(rows) < 2:
576
- return ""
577
- header = rows[0]
578
- body = rows[2:] if len(rows) > 2 else []
579
- thead = '<thead><tr>' + ''.join(f'<th>{c}</th>' for c in header) + '</tr></thead>'
580
- tbody = '<tbody>' + ''.join('<tr>' + ''.join(f'<td>{c}</td>' for c in r) + '</tr>' for r in body) + '</tbody>'
581
- return f'<table class="doc-table">{thead}{tbody}</table>'
582
-
583
- i = 0
584
- lines = page['content']
585
- n = len(lines)
586
- while i < n:
587
- raw_line = lines[i]
588
- line = raw_line.rstrip('\r\n')
589
- stripped = line.strip()
590
- if stripped.startswith('![') and ('](images/' in stripped or '](images\\' in stripped):
591
- flush_paragraph()
592
- match = _re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
593
- if match and base_dir is not None:
594
- caption = match.group(1)
595
- rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
596
- abs_path = (base_dir / rel_path).resolve()
597
- try:
598
- with open(abs_path, 'rb') as f:
599
- b64 = base64.b64encode(f.read()).decode('ascii')
600
- processed_content.append(f'<figure><img src="data:image/jpeg;base64,{b64}" alt="{_html.escape(caption)}"/><figcaption>{_html.escape(caption)}</figcaption></figure>')
601
- except Exception:
602
- processed_content.append(f'<div>{_html.escape(caption)} (image not found)</div>')
603
- else:
604
- paragraph_buffer.append(raw_line)
605
- i += 1
606
- continue
607
-
608
- # Detect markdown table blocks - only if line starts with | or has multiple | characters
609
- if (stripped.startswith('|') or stripped.count('|') >= 2) and i + 1 < n and is_md_table_header(lines[i + 1]):
610
- flush_paragraph()
611
- table_block = [stripped]
612
- i += 1
613
- table_block.append(lines[i].strip())
614
- i += 1
615
- while i < n:
616
- nxt = lines[i].rstrip('\r\n')
617
- if nxt.strip() == '' or (not nxt.strip().startswith('|') and nxt.count('|') < 2):
618
- break
619
- table_block.append(nxt.strip())
620
- i += 1
621
- html_table = render_md_table(table_block)
622
- if html_table:
623
- processed_content.append(html_table)
624
- else:
625
- for tl in table_block:
626
- paragraph_buffer.append(tl)
627
- continue
628
-
629
- if stripped.startswith('## '):
630
- flush_paragraph()
631
- processed_content.append(f'<h3>{_html.escape(stripped[3:])}</h3>')
632
- elif stripped.startswith('# '):
633
- flush_paragraph()
634
- processed_content.append(f'<h2>{_html.escape(stripped[2:])}</h2>')
635
- elif stripped == '':
636
- flush_paragraph()
637
- processed_content.append('<br/>')
638
- else:
639
- paragraph_buffer.append(raw_line)
640
- i += 1
641
- flush_paragraph()
642
-
643
- # Join the processed content lines
644
- content = "\n".join(processed_content)
645
-
646
- # Ensure page images are prepared
647
- try:
648
- if pdf_path and not page_images:
649
- from doctra.utils.pdf_io import render_pdf_to_images
650
- tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
651
- pil_pages = render_pdf_to_images(pdf_path)
652
- saved_paths: List[str] = []
653
- for idx, (im, _, _) in enumerate(pil_pages, start=1):
654
- out_path = tmp_img_dir / f"page_{idx:03d}.jpg"
655
- im.save(out_path, format="JPEG", quality=90)
656
- saved_paths.append(str(out_path))
657
- page_images = saved_paths
658
- page_images_state.value = saved_paths # cache
659
- except Exception as e:
660
- pass
661
-
662
- # Select image for the current page number (1-based)
663
- page_img = None
664
- try:
665
- page_index = int(page_num)
666
- if page_images and 1 <= page_index <= len(page_images):
667
- page_img = page_images[page_index - 1]
668
- except Exception:
669
- page_img = None
670
-
671
- return content, page_img
672
-
673
- def filter_gallery_by_image(img_path, caption, all_images):
674
- """Filter gallery to show only the selected image."""
675
- if not img_path or not all_images:
676
- return all_images
677
-
678
- # Find the selected image
679
- filtered_images = []
680
- for stored_img_path, stored_caption in all_images:
681
- if stored_caption == caption:
682
- filtered_images.append((stored_img_path, stored_caption))
683
- break
684
-
685
- return filtered_images
686
-
687
- def trigger_image_filter(filter_input):
688
- """Trigger image filtering when input changes."""
689
- if not filter_input:
690
- return "", ""
691
-
692
- # Parse the input (format: "img_path|caption")
693
- parts = filter_input.split("|", 1)
694
- if len(parts) == 2:
695
- img_path, caption = parts
696
- return img_path, caption
697
- return "", ""
698
-
699
- def filter_gallery_by_trigger(img_path, caption, all_images):
700
- """Filter gallery based on trigger values."""
701
- if not img_path or not caption or not all_images:
702
- return all_images
703
-
704
- # Find the selected image
705
- filtered_images = []
706
- for stored_img_path, stored_caption in all_images:
707
- if stored_caption == caption:
708
- filtered_images.append((stored_img_path, stored_caption))
709
- break
710
-
711
- return filtered_images
712
-
713
- def run_full_parse_with_pages(*args):
714
- """Run full parse and parse the markdown into pages."""
715
- result = run_full_parse(*args)
716
- status_msg, md_content, gallery_items, file_paths, zip_path = result
717
-
718
- # Parse markdown into pages
719
- pages_data = []
720
- first_page_content = ""
721
- all_images = []
722
- if md_content:
723
- pages_data = parse_markdown_by_pages(md_content)
724
-
725
- # Collect all images from all pages
726
- for page in pages_data:
727
- for line in page['content']:
728
- if line.strip().startswith('![') and ('](images/' in line or '](images\\' in line):
729
- import re
730
- match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line.strip())
731
- if match:
732
- caption = match.group(1)
733
- img_path = match.group(2)
734
- all_images.append((img_path, caption))
735
-
736
-
737
- # Show only Page 1 content initially
738
- if pages_data:
739
- first_page = pages_data[0]
740
- first_page_content = "\n".join(first_page['content'])
741
-
742
- # Prepare first page image immediately and cache page images
743
- input_pdf_path = args[0]
744
- first_page_image = None
745
- saved_paths: List[str] = []
746
- try:
747
- if input_pdf_path:
748
- from doctra.utils.pdf_io import render_pdf_to_images
749
- tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
750
- pil_pages = render_pdf_to_images(input_pdf_path)
751
- for idx, (im, _, _) in enumerate(pil_pages, start=1):
752
- out_path = tmp_img_dir / f"page_{idx:03d}.jpg"
753
- im.save(out_path, format="JPEG", quality=90)
754
- saved_paths.append(str(out_path))
755
- if saved_paths:
756
- first_page_image = saved_paths[0]
757
- except Exception as e:
758
- pass
759
-
760
- # Build initial HTML with inline images and proper blocks for first page
761
- if pages_data:
762
- import html as _html, base64, re as _re
763
- base_dir = None
764
- try:
765
- stem = Path(input_pdf_path).stem if input_pdf_path else ""
766
- if stem:
767
- base_dir = Path("outputs") / stem / "full_parse"
768
- except Exception:
769
- base_dir = None
770
- html_lines: List[str] = []
771
- for raw_line in pages_data[0]['content']:
772
- line = raw_line.strip()
773
- if line.startswith('![') and ('](images/' in line or '](images\\' in line):
774
- match = _re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
775
- if match and base_dir is not None:
776
- caption = match.group(1)
777
- rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
778
- abs_path = (base_dir / rel_path).resolve()
779
- try:
780
- with open(abs_path, 'rb') as f:
781
- b64 = base64.b64encode(f.read()).decode('ascii')
782
- html_lines.append(f'<figure><img src="data:image/jpeg;base64,{b64}" alt="{_html.escape(caption)}"/><figcaption>{_html.escape(caption)}</figcaption></figure>')
783
- except Exception:
784
- html_lines.append(f'<div>{_html.escape(caption)} (image not found)</div>')
785
- else:
786
- html_lines.append(f'<p>{_html.escape(raw_line)}</p>')
787
- else:
788
- if line.startswith('## '):
789
- html_lines.append(f'<h3>{_html.escape(line[3:])}</h3>')
790
- elif line.startswith('# '):
791
- html_lines.append(f'<h2>{_html.escape(line[2:])}</h2>')
792
- elif line == '':
793
- html_lines.append('<br/>')
794
- else:
795
- html_lines.append(f'<p>{_html.escape(raw_line)}</p>')
796
- first_page_content = "\n".join(html_lines)
797
-
798
- return status_msg, first_page_content, first_page_image, gallery_items, file_paths, zip_path, pages_data, all_images, input_pdf_path, saved_paths
799
-
800
- run_btn.click(
801
- fn=run_full_parse_with_pages,
802
- inputs=[pdf, use_vlm, vlm_provider, vlm_api_key, layout_model, dpi, min_score, ocr_lang, ocr_psm, ocr_oem, ocr_config, box_sep],
803
- outputs=[status, md_preview, page_image, gallery, files_out, zip_out, pages_state, all_images_state, pdf_path_state, page_images_state],
804
- ).then(
805
- fn=update_page_selector,
806
- inputs=[pages_state],
807
- outputs=[page_selector],
808
- )
809
-
810
- page_selector.change(
811
- fn=display_selected_page,
812
- inputs=[page_selector, pages_state, pdf_path_state, page_images_state],
813
- outputs=[md_preview, page_image],
814
- )
815
-
816
- image_filter_input.change(
817
- fn=trigger_image_filter,
818
- inputs=[image_filter_input],
819
- outputs=[current_image_path, current_image_caption],
820
- ).then(
821
- fn=filter_gallery_by_trigger,
822
- inputs=[current_image_path, current_image_caption, all_images_state],
823
- outputs=[gallery],
824
- )
825
-
826
- with gr.Tab("Extract Tables/Charts"):
827
- with gr.Row():
828
- pdf_e = gr.File(file_types=[".pdf"], label="PDF")
829
- target = gr.Dropdown(["tables", "charts", "both"], value="both", label="Target")
830
- use_vlm_e = gr.Checkbox(label="Use VLM (optional)", value=False)
831
- vlm_provider_e = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
832
- vlm_api_key_e = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
833
- with gr.Accordion("Advanced", open=False):
834
- with gr.Row():
835
- layout_model_e = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
836
- dpi_e = gr.Slider(100, 400, value=200, step=10, label="DPI")
837
- min_score_e = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")
838
-
839
- run_btn_e = gr.Button("▶ Run Extraction", variant="primary")
840
- status_e = gr.Textbox(label="Status")
841
- # Dropdown to select specific item
842
- item_selector_e = gr.Dropdown(label="Select Item", visible=False, interactive=True)
843
-
844
- # Display extracted data and images
845
- with gr.Row():
846
- tables_preview_e = gr.HTML(label="Extracted Data", elem_classes=["page-content"])
847
- image_e = gr.Image(label="Selected Image", interactive=False)
848
-
849
- # Keep gallery for reference but make it smaller
850
- gallery_e = gr.Gallery(label="All Extracted Images", columns=4, height=200, preview=True)
851
- files_out_e = gr.Files(label="Download individual output files")
852
- zip_out_e = gr.File(label="Download all outputs (ZIP)")
853
-
854
- # State to store output directory
855
- out_dir_state = gr.State("")
856
-
857
def capture_out_dir(status_text):
    """Extract the output directory path embedded in a status message.

    The status text is searched for the marker ``"Output directory:"``;
    everything after the first occurrence is returned with surrounding
    whitespace stripped.

    Args:
        status_text: Status message to inspect.

    Returns:
        The text following the marker, or ``""`` when the input is empty,
        is not a string, or does not contain the marker.
    """
    # Anything that is not a string cannot carry a path. Checking the type
    # explicitly replaces the former blanket ``except Exception: pass``.
    if not isinstance(status_text, str):
        return ""
    _, marker_found, remainder = status_text.partition("Output directory:")
    return remainder.strip() if marker_found else ""
866
-
867
def build_item_selector(out_dir_path, target, use_vlm):
    """Build the "Select Item" dropdown from ``vlm_items.json``.

    Args:
        out_dir_path: Directory expected to contain ``vlm_items.json``.
        target: ``"tables"``, ``"charts"`` or ``"both"``; selects which
            entry kinds (``"table"`` / ``"chart"``) are listed.
        use_vlm: When falsy the selector is hidden without reading anything.

    Returns:
        A ``gr.Dropdown`` whose choices are ``(label, image_rel_path)``
        pairs with the first one pre-selected, or a hidden, empty dropdown
        when nothing can be listed.
    """
    import json

    def _hidden():
        return gr.Dropdown(choices=[], value=None, visible=False)

    if not out_dir_path or not use_vlm:
        return _hidden()

    try:
        mapping = Path(out_dir_path) / "vlm_items.json"
        data = json.loads(mapping.read_text(encoding="utf-8"))
    except (OSError, ValueError, TypeError):
        # Missing, unreadable or malformed mapping file: nothing to select.
        return _hidden()
    if not isinstance(data, list):
        return _hidden()

    # Entry kind required by a single-kind target; "both" accepts every kind.
    wanted_kind = {"tables": "table", "charts": "chart"}.get(target)

    choices = []
    for entry in data:
        # Handle entries one by one so a single malformed record cannot
        # hide the whole selector.
        if not isinstance(entry, dict):
            continue
        kind = entry.get("kind")
        if target != "both" and (wanted_kind is None or kind != wanted_kind):
            continue
        rel_path = entry.get("image_rel_path")
        if not rel_path:
            # Without an image path the entry cannot be looked up later.
            continue
        title = entry.get("title") or (str(kind).title() if kind else "Item")
        page = entry.get("page")
        label = f"{title} — Page {page}" if page is not None else f"{title}"
        choices.append((label, rel_path))

    if not choices:
        return _hidden()
    return gr.Dropdown(choices=choices, value=choices[0][1], visible=True)
894
-
895
def show_selected_item(rel_path, out_dir_path):
    """Render the data and image for one extracted item.

    Looks up the entry of ``vlm_items.json`` (inside ``out_dir_path``) whose
    ``image_rel_path`` equals ``rel_path`` and renders it as HTML.

    Args:
        rel_path: Image path, relative to ``out_dir_path``, identifying the item.
        out_dir_path: Directory containing ``vlm_items.json``.

    Returns:
        A ``(html, image_path)`` tuple. ``html`` is a heading followed by a
        ``doc-table`` table when the entry has both headers and rows, or by
        a "No structured data available" paragraph otherwise. ``image_path``
        is the resolved absolute image path. ``("", None)`` is returned when
        an argument is empty, the mapping file cannot be read or parsed, or
        no entry matches.
    """
    import html as _html
    import json

    nothing = ("", None)
    if not rel_path or not out_dir_path:
        return nothing

    try:
        out_dir = Path(out_dir_path)
        data = json.loads((out_dir / "vlm_items.json").read_text(encoding="utf-8"))
    except (OSError, ValueError, TypeError):
        # Missing, unreadable or malformed mapping file.
        return nothing
    if not isinstance(data, list):
        return nothing

    entry = next(
        (e for e in data if isinstance(e, dict) and e.get("image_rel_path") == rel_path),
        None,
    )
    if entry is None:
        return nothing

    headers = entry.get("headers")
    rows = entry.get("rows")
    headers = headers if isinstance(headers, list) else []
    rows = rows if isinstance(rows, list) else []

    # ``or`` also covers an explicit JSON null, which ``dict.get`` with a
    # default would let through as ``None``.
    title = _html.escape(str(entry.get("title") or "Data"))
    kind = _html.escape(str(entry.get("kind") or "table").title())
    heading = f'<h3>{title} ({kind})</h3>'

    if headers and rows:
        head_cells = ''.join(f'<th>{_html.escape(str(h))}</th>' for h in headers)
        body_rows = ''.join(
            '<tr>' + ''.join(f'<td>{_html.escape(str(c))}</td>' for c in row) + '</tr>'
            for row in rows
            if isinstance(row, (list, tuple))  # skip rows that are not cell sequences
        )
        html_table = (
            f'{heading}<table class="doc-table">'
            f'<thead><tr>{head_cells}</tr></thead>'
            f'<tbody>{body_rows}</tbody></table>'
        )
    else:
        html_table = f'{heading}<p>No structured data available</p>'

    try:
        img_abs = str((out_dir / rel_path).resolve())
    except (OSError, TypeError, ValueError):
        img_abs = None
    return html_table, img_abs
930
-
931
- run_btn_e.click(
932
- fn=lambda f, t, a, b, c, d, e, g: run_extract(
933
- f.name if f else "",
934
- t,
935
- a,
936
- b,
937
- c,
938
- d,
939
- e,
940
- g,
941
- ),
942
- inputs=[pdf_e, target, use_vlm_e, vlm_provider_e, vlm_api_key_e, layout_model_e, dpi_e, min_score_e],
943
- outputs=[status_e, tables_preview_e, gallery_e, files_out_e, zip_out_e],
944
- ).then(
945
- fn=capture_out_dir,
946
- inputs=[status_e],
947
- outputs=[out_dir_state]
948
- ).then(
949
- fn=build_item_selector,
950
- inputs=[out_dir_state, target, use_vlm_e],
951
- outputs=[item_selector_e]
952
- ).then(
953
- fn=show_selected_item,
954
- inputs=[item_selector_e, out_dir_state],
955
- outputs=[tables_preview_e, image_e]
956
- )
957
-
958
- # Handle dropdown selection changes
959
- item_selector_e.change(
960
- fn=show_selected_item,
961
- inputs=[item_selector_e, out_dir_state],
962
- outputs=[tables_preview_e, image_e]
963
- )
964
-
965
-
966
- gr.Markdown(
967
- """
968
- <div class="card">
969
- <b>Tips</b>
970
- <ul>
971
- <li>On Spaces, set a secret <code>VLM_API_KEY</code> to enable VLM features.</li>
972
- <li>Outputs are saved under <code>outputs/&lt;pdf_stem&gt;/</code>.</li>
973
- </ul>
974
- </div>
975
- """
976
- )
51
+ # Tips section
52
+ gr.Markdown(create_tips_markdown())
977
53
 
978
54
  return demo
979
55
 
980
56
 
981
def launch_ui(server_name=None, server_port=None, share=None):
    """
    Launch the Doctra Gradio interface.

    This function creates and launches the main application interface.

    Args:
        server_name: Optional host/interface to bind, forwarded to
            ``demo.launch``. ``None`` leaves the choice to Gradio.
        server_port: Optional port, forwarded to ``demo.launch``. ``None``
            leaves the choice to Gradio.
        share: Optional flag forwarded to ``demo.launch``. ``None`` leaves
            the choice to Gradio.
    """
    demo = build_demo()
    # The parameters are kept so existing callers that pass a host, port or
    # share flag keep working. With every value left at None this is
    # intended to match a bare ``demo.launch()``.
    demo.launch(server_name=server_name, server_port=server_port, share=share)