doctra 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. doctra/__init__.py +4 -0
  2. doctra/cli/main.py +168 -0
  3. doctra/engines/image_restoration/__init__.py +10 -0
  4. doctra/engines/image_restoration/docres_engine.py +566 -0
  5. doctra/engines/vlm/service.py +0 -12
  6. doctra/parsers/enhanced_pdf_parser.py +370 -0
  7. doctra/parsers/structured_pdf_parser.py +11 -60
  8. doctra/parsers/table_chart_extractor.py +8 -44
  9. doctra/third_party/docres/data/MBD/MBD.py +110 -0
  10. doctra/third_party/docres/data/MBD/MBD_utils.py +291 -0
  11. doctra/third_party/docres/data/MBD/infer.py +151 -0
  12. doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +95 -0
  13. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +13 -0
  14. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +402 -0
  15. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +151 -0
  16. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +170 -0
  17. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +288 -0
  18. doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +59 -0
  19. doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +81 -0
  20. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +12 -0
  21. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +282 -0
  22. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +129 -0
  23. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +88 -0
  24. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +29 -0
  25. doctra/third_party/docres/data/preprocess/crop_merge_image.py +142 -0
  26. doctra/third_party/docres/inference.py +370 -0
  27. doctra/third_party/docres/models/restormer_arch.py +308 -0
  28. doctra/third_party/docres/utils.py +464 -0
  29. doctra/ui/app.py +5 -32
  30. doctra/utils/progress.py +13 -98
  31. doctra/utils/structured_utils.py +45 -49
  32. doctra/version.py +1 -1
  33. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/METADATA +1 -1
  34. doctra-0.4.0.dist-info/RECORD +67 -0
  35. doctra-0.3.2.dist-info/RECORD +0 -44
  36. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/WHEEL +0 -0
  37. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/licenses/LICENSE +0 -0
  38. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,370 @@
1
+ """
2
+ Enhanced PDF Parser with Image Restoration
3
+
4
+ This module provides an enhanced PDF parser that combines the structured parsing
5
+ capabilities with DocRes image restoration for improved document processing.
6
+ """
7
+
8
+ from __future__ import annotations
9
+ import os
10
+ import sys
11
+ from typing import List, Dict, Any, Optional, Union
12
+ from contextlib import ExitStack
13
+ from PIL import Image
14
+ from tqdm import tqdm
15
+
16
+ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
17
+ from doctra.engines.image_restoration import DocResEngine
18
+ from doctra.utils.pdf_io import render_pdf_to_images
19
+ from doctra.utils.constants import IMAGE_SUBDIRS
20
+ from doctra.utils.file_ops import ensure_output_dirs
21
+ from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
22
+
23
+
24
class EnhancedPDFParser(StructuredPDFParser):
    """
    Enhanced PDF Parser with Image Restoration capabilities.

    Extends the StructuredPDFParser with DocRes image restoration to improve
    document quality before processing. This is particularly useful for:
    - Scanned documents with shadows or distortion
    - Low-quality PDFs that need enhancement
    - Documents with perspective issues

    Layout detection always runs on the original PDF rendered at ``dpi``; the
    restored page images are only used as the source for cropping and OCR.

    :param use_image_restoration: Whether to apply DocRes image restoration (default: True)
    :param restoration_task: DocRes task to use ("dewarping", "deshadowing", "appearance", "deblurring", "binarization", "end2end", default: "appearance")
    :param restoration_device: Device for DocRes processing ("cuda", "cpu", or None for auto-detect, default: None)
    :param restoration_dpi: DPI for restoration processing (default: 200)
    :param use_vlm: Whether to use VLM for structured data extraction (default: False)
    :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
    :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
    :param dpi: DPI for PDF rendering (default: 200)
    :param min_score: Minimum confidence score for layout detection (default: 0.0)
    :param ocr_lang: OCR language code (default: "eng")
    :param ocr_psm: Tesseract page segmentation mode (default: 4)
    :param ocr_oem: Tesseract OCR engine mode (default: 3)
    :param ocr_extra_config: Additional Tesseract configuration (default: "")
    :param box_separator: Separator between text boxes in output (default: "\\n")
    """

    def __init__(
        self,
        *,
        use_image_restoration: bool = True,
        restoration_task: str = "appearance",
        restoration_device: Optional[str] = None,
        restoration_dpi: int = 200,
        use_vlm: bool = False,
        vlm_provider: str = "gemini",
        vlm_model: str | None = None,
        vlm_api_key: str | None = None,
        layout_model_name: str = "PP-DocLayout_plus-L",
        dpi: int = 200,
        min_score: float = 0.0,
        ocr_lang: str = "eng",
        ocr_psm: int = 4,
        ocr_oem: int = 3,
        ocr_extra_config: str = "",
        box_separator: str = "\n",
    ):
        """
        Initialize the Enhanced PDF Parser with image restoration capabilities.

        The parent parser is configured first; the DocRes engine is then
        created on a best-effort basis. If the engine cannot be constructed,
        restoration is switched off and parsing still works on the plain
        page renderings.
        """
        super().__init__(
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_model=vlm_model,
            vlm_api_key=vlm_api_key,
            layout_model_name=layout_model_name,
            dpi=dpi,
            min_score=min_score,
            ocr_lang=ocr_lang,
            ocr_psm=ocr_psm,
            ocr_oem=ocr_oem,
            ocr_extra_config=ocr_extra_config,
            box_separator=box_separator,
        )

        # Image restoration settings
        self.use_image_restoration = use_image_restoration
        self.restoration_task = restoration_task
        self.restoration_device = restoration_device
        self.restoration_dpi = restoration_dpi

        # The engine stays None unless restoration is requested AND it
        # initializes successfully.
        self.docres_engine = None
        if self.use_image_restoration:
            try:
                self.docres_engine = DocResEngine(
                    device=restoration_device,
                    use_half_precision=True,
                )
                print(f"✅ DocRes engine initialized with task: {restoration_task}")
            except Exception as e:
                # Deliberate best-effort: a missing/broken engine must not
                # prevent ordinary parsing.
                print(f"⚠️ DocRes initialization failed: {e}")
                print(" Continuing without image restoration...")
                self.use_image_restoration = False
                self.docres_engine = None

    def parse(self, pdf_path: str, enhanced_output_dir: Optional[str] = None) -> None:
        """
        Parse a PDF document with optional image restoration.

        :param pdf_path: Path to the input PDF file
        :param enhanced_output_dir: Directory for enhanced images (if None,
            ``outputs/<pdf name>/enhanced_parse`` is used)
        :return: None
        """
        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]

        if enhanced_output_dir is None:
            out_dir = f"outputs/{pdf_filename}/enhanced_parse"
        else:
            out_dir = enhanced_output_dir

        os.makedirs(out_dir, exist_ok=True)
        ensure_output_dirs(out_dir, IMAGE_SUBDIRS)

        if self.use_image_restoration and self.docres_engine is not None:
            print(f"🔄 Processing PDF with image restoration: {os.path.basename(pdf_path)}")
            enhanced_pages = self._process_pages_with_restoration(pdf_path, out_dir)
            # Layout boxes below are expressed in pixels at self.dpi, so the
            # restored pages must live on the same pixel grid before cropping.
            enhanced_pages = self._match_layout_resolution(enhanced_pages)
        else:
            print(f"🔄 Processing PDF without image restoration: {os.path.basename(pdf_path)}")
            enhanced_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]

        # NOTE: detection is performed on the original PDF (rendered by the
        # layout engine at self.dpi); only cropping/OCR use the enhanced pages.
        print("🔍 Running layout detection on enhanced pages...")
        pages = self.layout_engine.predict_pdf(
            pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
        )

        self._process_parsing_logic(pages, enhanced_pages, out_dir, pdf_filename)

    def _match_layout_resolution(self, page_images: List[Image.Image]) -> List[Image.Image]:
        """
        Rescale pages rendered at ``restoration_dpi`` to the ``dpi`` pixel grid.

        Without this step, a ``restoration_dpi`` different from ``dpi`` makes
        every layout box crop the wrong region of the restored page.

        :param page_images: Pages rendered (and possibly restored) at ``restoration_dpi``
        :return: The same list when no scaling is needed, otherwise resized copies
        """
        if not self.restoration_dpi or self.restoration_dpi == self.dpi:
            return page_images

        scale = self.dpi / self.restoration_dpi
        return [
            img.resize(
                (max(1, round(img.width * scale)), max(1, round(img.height * scale))),
                Image.LANCZOS,
            )
            for img in page_images
        ]

    @staticmethod
    def _make_progress_bar(total: int, desc: str):
        """Create the progress bar flavour suited to the current environment."""
        is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
        if is_notebook:
            return create_notebook_friendly_bar(total=total, desc=desc)
        return create_beautiful_progress_bar(total=total, desc=desc, leave=True)

    def _process_pages_with_restoration(self, pdf_path: str, out_dir: str) -> List[Image.Image]:
        """
        Process PDF pages with DocRes image restoration.

        Every input page yields exactly one output image: the restored page on
        success, or the untouched rendering if restoration or saving fails.

        :param pdf_path: Path to the input PDF file
        :param out_dir: Output directory for enhanced images
        :return: List of enhanced PIL images (empty if the PDF has no pages)
        """
        # Imported lazily (once, not per page) because the module does not
        # import numpy at top level.
        import numpy as np

        original_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.restoration_dpi)]

        if not original_pages:
            print("❌ No pages found in PDF")
            return []

        total = len(original_pages)
        progress_bar = self._make_progress_bar(total, f"🔄 DocRes {self.restoration_task}")

        enhanced_pages: List[Image.Image] = []
        enhanced_dir = os.path.join(out_dir, "enhanced_pages")
        os.makedirs(enhanced_dir, exist_ok=True)

        # The context manager closes the bar; no extra close() is required.
        with progress_bar:
            for page_no, page_img in enumerate(original_pages, start=1):
                try:
                    restored_img, _ = self.docres_engine.restore_image(
                        np.array(page_img),
                        task=self.restoration_task,
                    )
                    result_page = Image.fromarray(restored_img)

                    # Save enhanced page for reference
                    enhanced_path = os.path.join(enhanced_dir, f"page_{page_no:03d}_enhanced.jpg")
                    result_page.save(enhanced_path, "JPEG", quality=95)
                except Exception as e:
                    print(f" ⚠️ Page {page_no} restoration failed: {e}, using original")
                    result_page = page_img
                    progress_bar.set_description(f"⚠️ Page {page_no} failed, using original")
                else:
                    progress_bar.set_description(f"✅ Page {page_no}/{total} enhanced")

                # Append once per page, outside the try, so a late failure
                # (e.g. during save) can never add a page twice and shift the
                # page/layout alignment.
                enhanced_pages.append(result_page)
                progress_bar.update(1)

        print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
        return enhanced_pages

    def _vlm_table_markdown(
        self,
        label: str,
        image_path: str,
        structured_items: List[Dict[str, Any]],
    ) -> Optional[str]:
        """
        Convert a cropped chart/table image into a Markdown table via the VLM.

        :param label: Either "chart" or "table"; selects the VLM extraction call
        :param image_path: Absolute path of the cropped element image
        :param structured_items: Accumulator that receives the structured item
        :return: Markdown table text, or None when the caller should fall back
            to embedding the image (VLM disabled, empty result, or failure)
        """
        if not (self.use_vlm and self.vlm):
            return None

        from doctra.utils.structured_utils import to_structured_dict
        from doctra.exporters.markdown_table import render_markdown_table

        extract = self.vlm.extract_chart if label == "chart" else self.vlm.extract_table
        try:
            item = to_structured_dict(extract(image_path))
            if not item:
                return None
            structured_items.append(item)
            return render_markdown_table(
                item.get("headers"), item.get("rows"), title=item.get("title")
            )
        except Exception as e:
            # Still best-effort (image fallback), but no longer silent.
            print(f"⚠️ VLM {label} extraction failed for {image_path}: {e}")
            return None

    def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename):
        """
        Process the parsing logic with enhanced pages.
        This is extracted from the parent class to allow customization.

        :param pages: Layout detection results, one entry per page, each
            exposing ``page_index`` (used as a 1-based index) and ``boxes``
        :param pil_pages: Page images used for cropping and OCR
        :param out_dir: Directory receiving Markdown/HTML/Excel output and crops
        :param pdf_filename: PDF base name (kept for interface compatibility; unused)
        """
        from doctra.utils.constants import EXCLUDE_LABELS
        from doctra.parsers.layout_order import reading_order_key
        from doctra.utils.ocr_utils import ocr_box_text
        from doctra.exporters.image_saver import save_box_image
        from doctra.exporters.markdown_writer import write_markdown
        from doctra.exporters.html_writer import write_html
        from doctra.exporters.excel_writer import write_structured_excel
        from doctra.exporters.html_writer import write_structured_html

        # Bars are created in this order: charts, tables, figures.
        bar_descs = {
            "chart": "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)",
            "table": "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)",
            "figure": "Figures (cropped)",
        }
        counts = {
            label: sum(1 for p in pages for b in p.boxes if b.label == label)
            for label in bar_descs
        }

        md_lines: List[str] = ["# Enhanced Document Content\n"]
        structured_items: List[Dict[str, Any]] = []

        with ExitStack() as stack:
            bars = {
                label: (
                    stack.enter_context(self._make_progress_bar(counts[label], desc))
                    if counts[label]
                    else None
                )
                for label, desc in bar_descs.items()
            }

            for p in pages:
                page_num = p.page_index
                if not 1 <= page_num <= len(pil_pages):
                    # Happens when rendering produced fewer images than the
                    # layout engine produced pages; skip instead of crashing
                    # (or silently wrapping around to the last page).
                    print(f"⚠️ No page image available for page {page_num}, skipping")
                    continue

                page_img: Image.Image = pil_pages[page_num - 1]
                md_lines.append(f"\n## Page {page_num}\n")

                for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
                    if box.label not in EXCLUDE_LABELS:
                        # Regular text region: OCR it straight from the page image.
                        text = ocr_box_text(self.ocr_engine, page_img, box)
                        if text:
                            md_lines.append(text)
                            md_lines.append(self.box_separator if self.box_separator else "")
                        continue

                    # Visual element: always save the crop, then reference it.
                    img_path = save_box_image(page_img, box, out_dir, page_num, i, IMAGE_SUBDIRS)
                    abs_img_path = os.path.abspath(img_path)
                    rel = os.path.relpath(abs_img_path, out_dir)

                    if box.label not in bar_descs:
                        # Excluded from OCR but not figure/chart/table: crop only.
                        continue

                    table_md = None
                    if box.label != "figure":
                        table_md = self._vlm_table_markdown(box.label, abs_img_path, structured_items)

                    if table_md is not None:
                        md_lines.append(table_md)
                    else:
                        md_lines.append(f"![{box.label.title()} — page {page_num}]({rel})\n")

                    bar = bars[box.label]
                    if bar:
                        bar.update(1)

        write_markdown(md_lines, out_dir)
        write_html(md_lines, out_dir)

        if self.use_vlm and structured_items:
            write_structured_excel(os.path.join(out_dir, "tables.xlsx"), structured_items)
            write_structured_html(os.path.join(out_dir, "tables.html"), structured_items)

        print(f"✅ Enhanced parsing completed successfully!")
        print(f"📁 Output directory: {out_dir}")

    def restore_pdf_only(
        self,
        pdf_path: str,
        output_path: Optional[str] = None,
        task: Optional[str] = None,
    ) -> Optional[str]:
        """
        Apply DocRes restoration to a PDF without parsing.

        :param pdf_path: Path to the input PDF file
        :param output_path: Path for the enhanced PDF (if None, auto-generates)
        :param task: DocRes restoration task (if None, uses instance default)
        :return: Path to the enhanced PDF or None if failed
        :raises RuntimeError: If restoration is disabled or the engine is unavailable
        """
        if not self.use_image_restoration or self.docres_engine is None:
            raise RuntimeError("Image restoration is not enabled or DocRes engine is not available")

        task = task or self.restoration_task
        return self.docres_engine.restore_pdf(pdf_path, output_path, task, self.restoration_dpi)

    def get_restoration_info(self) -> Dict[str, Any]:
        """
        Get information about the current restoration configuration.

        :return: Dictionary with restoration settings and status
        """
        engine = self.docres_engine
        return {
            'enabled': self.use_image_restoration,
            'task': self.restoration_task,
            'device': self.restoration_device,
            'dpi': self.restoration_dpi,
            'engine_available': engine is not None,
            'supported_tasks': engine.get_supported_tasks() if engine else [],
        }
@@ -64,22 +64,19 @@ class StructuredPDFParser:
64
64
  ):
65
65
  """
66
66
  Initialize the StructuredPDFParser with processing configuration.
67
-
68
- Sets up the layout detection engine, OCR engine, and optionally
69
- the VLM service for comprehensive document processing.
70
67
 
71
- :param use_vlm: Whether to use VLM for structured data extraction
72
- :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
68
+ :param use_vlm: Whether to use VLM for structured data extraction (default: False)
69
+ :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
73
70
  :param vlm_model: Model name to use (defaults to provider-specific defaults)
74
- :param vlm_api_key: API key for VLM provider
75
- :param layout_model_name: Layout detection model name
76
- :param dpi: DPI for PDF rendering
77
- :param min_score: Minimum confidence score for layout detection
78
- :param ocr_lang: OCR language code
79
- :param ocr_psm: Tesseract page segmentation mode
80
- :param ocr_oem: Tesseract OCR engine mode
81
- :param ocr_extra_config: Additional Tesseract configuration
82
- :param box_separator: Separator between text boxes in output
71
+ :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
72
+ :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
73
+ :param dpi: DPI for PDF rendering (default: 200)
74
+ :param min_score: Minimum confidence score for layout detection (default: 0.0)
75
+ :param ocr_lang: OCR language code (default: "eng")
76
+ :param ocr_psm: Tesseract page segmentation mode (default: 4)
77
+ :param ocr_oem: Tesseract OCR engine mode (default: 3)
78
+ :param ocr_extra_config: Additional Tesseract configuration (default: "")
79
+ :param box_separator: Separator between text boxes in output (default: "\n")
83
80
  """
84
81
  self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
85
82
  self.dpi = dpi
@@ -100,15 +97,10 @@ class StructuredPDFParser:
100
97
  def parse(self, pdf_path: str) -> None:
101
98
  """
102
99
  Parse a PDF document and extract all content types.
103
-
104
- Processes the PDF through layout detection, extracts text using OCR,
105
- saves images for visual elements, and optionally converts charts/tables
106
- to structured data using VLM.
107
100
 
108
101
  :param pdf_path: Path to the input PDF file
109
102
  :return: None
110
103
  """
111
- # Extract filename without extension and create output directory
112
104
  pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
113
105
  out_dir = f"outputs/{pdf_filename}/full_parse"
114
106
 
@@ -120,7 +112,6 @@ class StructuredPDFParser:
120
112
  )
121
113
  pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
122
114
 
123
- # Count for progress bars
124
115
  fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
125
116
  chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
126
117
  table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
@@ -133,11 +124,8 @@ class StructuredPDFParser:
133
124
  figures_desc = "Figures (cropped)"
134
125
 
135
126
  with ExitStack() as stack:
136
- # Enhanced environment detection
137
127
  is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
138
128
  is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
139
-
140
- # Use appropriate progress bars based on environment
141
129
  if is_notebook:
142
130
  charts_bar = stack.enter_context(
143
131
  create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
@@ -165,13 +153,11 @@ class StructuredPDFParser:
165
153
  rel = os.path.relpath(abs_img_path, out_dir)
166
154
 
167
155
  if box.label == "figure":
168
- # Figures are always images in MD
169
156
  md_lines.append(f"![Figure — page {page_num}]({rel})\n")
170
157
  if figures_bar: figures_bar.update(1)
171
158
 
172
159
  elif box.label == "chart":
173
160
  if self.use_vlm and self.vlm:
174
- # Try structured → Markdown table; fallback to image if it fails
175
161
  wrote_table = False
176
162
  try:
177
163
  chart = self.vlm.extract_chart(abs_img_path)
@@ -193,7 +179,6 @@ class StructuredPDFParser:
193
179
 
194
180
  elif box.label == "table":
195
181
  if self.use_vlm and self.vlm:
196
- # Try structured → Markdown table; fallback to image if it fails
197
182
  wrote_table = False
198
183
  try:
199
184
  table = self.vlm.extract_table(abs_img_path)
@@ -229,7 +214,6 @@ class StructuredPDFParser:
229
214
  html_structured_path = os.path.join(out_dir, "tables.html")
230
215
  write_structured_html(html_structured_path, structured_items)
231
216
 
232
- # Print completion message with output directory
233
217
  print(f"✅ Parsing completed successfully!")
234
218
  print(f"📁 Output directory: {out_dir}")
235
219
 
@@ -249,30 +233,25 @@ class StructuredPDFParser:
249
233
  :param save_path: Optional path to save the visualization (if None, displays only)
250
234
  :return: None
251
235
  """
252
- # Get layout predictions
253
236
  pages: List[LayoutPage] = self.layout_engine.predict_pdf(
254
237
  pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
255
238
  )
256
239
  pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
257
240
 
258
- # Limit to requested number of pages
259
241
  pages_to_show = min(num_pages, len(pages))
260
242
 
261
243
  if pages_to_show == 0:
262
244
  print("No pages to display")
263
245
  return
264
246
 
265
- # Calculate grid dimensions
266
247
  rows = (pages_to_show + cols - 1) // cols
267
248
 
268
- # Collect unique labels from the processed pages and assign colors
269
249
  used_labels = set()
270
250
  for idx in range(pages_to_show):
271
251
  page = pages[idx]
272
252
  for box in page.boxes:
273
253
  used_labels.add(box.label.lower())
274
254
 
275
- # Create dynamic color assignment for all detected labels
276
255
  base_colors = ['#3B82F6', '#EF4444', '#10B981', '#F59E0B', '#8B5CF6',
277
256
  '#F97316', '#EC4899', '#6B7280', '#84CC16', '#06B6D4',
278
257
  '#DC2626', '#059669', '#7C3AED', '#DB2777', '#0891B2']
@@ -281,22 +260,18 @@ class StructuredPDFParser:
281
260
  for i, label in enumerate(sorted(used_labels)):
282
261
  dynamic_label_colors[label] = base_colors[i % len(base_colors)]
283
262
 
284
- # Process each page and add bounding boxes
285
263
  processed_pages = []
286
264
 
287
265
  for idx in range(pages_to_show):
288
266
  page = pages[idx]
289
267
  page_img = pil_pages[idx].copy()
290
268
 
291
- # Calculate scale factor to resize to target width
292
269
  scale_factor = page_width / page_img.width
293
270
  new_height = int(page_img.height * scale_factor)
294
271
  page_img = page_img.resize((page_width, new_height), Image.LANCZOS)
295
272
 
296
- # Create drawing context
297
273
  draw = ImageDraw.Draw(page_img)
298
274
 
299
- # Try to load a nice font, fallback to default
300
275
  try:
301
276
  font = ImageFont.truetype("arial.ttf", 24)
302
277
  small_font = ImageFont.truetype("arial.ttf", 18)
@@ -308,21 +283,16 @@ class StructuredPDFParser:
308
283
  font = None
309
284
  small_font = None
310
285
 
311
- # Draw bounding boxes
312
286
  for box in page.boxes:
313
- # Scale coordinates
314
287
  x1 = int(box.x1 * scale_factor)
315
288
  y1 = int(box.y1 * scale_factor)
316
289
  x2 = int(box.x2 * scale_factor)
317
290
  y2 = int(box.y2 * scale_factor)
318
291
 
319
- # Get color for this label from dynamic assignment
320
292
  color = dynamic_label_colors.get(box.label.lower(), '#000000')
321
293
 
322
- # Draw rectangle with rounded corners effect
323
294
  draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
324
295
 
325
- # Draw label background
326
296
  label_text = f"{box.label} ({box.score:.2f})"
327
297
  if font:
328
298
  bbox = draw.textbbox((0, 0), label_text, font=small_font)
@@ -332,11 +302,9 @@ class StructuredPDFParser:
332
302
  text_width = len(label_text) * 8
333
303
  text_height = 15
334
304
 
335
- # Position label above the box
336
305
  label_x = x1
337
306
  label_y = max(0, y1 - text_height - 8)
338
307
 
339
- # Draw label background with padding
340
308
  padding = 4
341
309
  draw.rectangle([
342
310
  label_x - padding,
@@ -345,10 +313,8 @@ class StructuredPDFParser:
345
313
  label_y + text_height + padding
346
314
  ], fill='white', outline=color, width=2)
347
315
 
348
- # Draw label text
349
316
  draw.text((label_x, label_y), label_text, fill=color, font=small_font)
350
317
 
351
- # Add page title
352
318
  title_text = f"Page {page.page_index} ({len(page.boxes)} boxes)"
353
319
  if font:
354
320
  title_bbox = draw.textbbox((0, 0), title_text, font=font)
@@ -356,7 +322,6 @@ class StructuredPDFParser:
356
322
  else:
357
323
  title_width = len(title_text) * 12
358
324
 
359
- # Draw title background
360
325
  title_x = (page_width - title_width) // 2
361
326
  title_y = 10
362
327
  draw.rectangle([title_x - 10, title_y - 5, title_x + title_width + 10, title_y + 35],
@@ -365,16 +330,13 @@ class StructuredPDFParser:
365
330
 
366
331
  processed_pages.append(page_img)
367
332
 
368
- # Create grid layout with space for legend
369
333
  legend_width = 250
370
334
  grid_width = cols * page_width + (cols - 1) * spacing
371
335
  total_width = grid_width + legend_width + spacing
372
336
  grid_height = rows * (processed_pages[0].height if processed_pages else 600) + (rows - 1) * spacing
373
337
 
374
- # Create final grid image with modern background
375
338
  final_img = Image.new('RGB', (total_width, grid_height), '#F8FAFC')
376
339
 
377
- # Place pages in grid
378
340
  for idx, page_img in enumerate(processed_pages):
379
341
  row = idx // cols
380
342
  col = idx % cols
@@ -384,13 +346,11 @@ class StructuredPDFParser:
384
346
 
385
347
  final_img.paste(page_img, (x_pos, y_pos))
386
348
 
387
- # Create legend
388
349
  legend_x = grid_width + spacing
389
350
  legend_y = 20
390
351
 
391
352
  draw_legend = ImageDraw.Draw(final_img)
392
353
 
393
- # Legend title
394
354
  legend_title = "Element Types"
395
355
  if font:
396
356
  title_bbox = draw_legend.textbbox((0, 0), legend_title, font=font)
@@ -400,47 +360,38 @@ class StructuredPDFParser:
400
360
  title_width = len(legend_title) * 12
401
361
  title_height = 20
402
362
 
403
- # Draw legend background
404
363
  legend_bg_height = len(used_labels) * 35 + title_height + 40
405
364
  draw_legend.rectangle([legend_x - 10, legend_y - 10,
406
365
  legend_x + legend_width - 10, legend_y + legend_bg_height],
407
366
  fill='white', outline='#E5E7EB', width=2)
408
367
 
409
- # Draw legend title
410
368
  draw_legend.text((legend_x + 10, legend_y + 5), legend_title,
411
369
  fill='#1F2937', font=font)
412
370
 
413
- # Draw legend items - now using dynamic colors for actually detected labels
414
371
  current_y = legend_y + title_height + 20
415
372
 
416
373
  for label in sorted(used_labels):
417
374
  color = dynamic_label_colors[label]
418
375
 
419
- # Draw color square
420
376
  square_size = 20
421
377
  draw_legend.rectangle([legend_x + 10, current_y,
422
378
  legend_x + 10 + square_size, current_y + square_size],
423
379
  fill=color, outline='#6B7280', width=1)
424
380
 
425
- # Draw label text
426
381
  draw_legend.text((legend_x + 40, current_y + 2), label.title(),
427
382
  fill='#374151', font=small_font)
428
383
 
429
384
  current_y += 30
430
385
 
431
- # Save or display
432
386
  if save_path:
433
387
  final_img.save(save_path, quality=95, optimize=True)
434
388
  print(f"Layout visualization saved to: {save_path}")
435
389
  else:
436
- # Display using PIL's default viewer
437
390
  final_img.show()
438
391
 
439
- # Print summary statistics
440
392
  print(f"\n📊 Layout Detection Summary for {os.path.basename(pdf_path)}:")
441
393
  print(f"Pages processed: {pages_to_show}")
442
394
 
443
- # Create summary by label across all pages
444
395
  total_counts = {}
445
396
  for idx in range(pages_to_show):
446
397
  page = pages[idx]