doctra 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +4 -0
- doctra/cli/main.py +168 -0
- doctra/engines/image_restoration/__init__.py +10 -0
- doctra/engines/image_restoration/docres_engine.py +566 -0
- doctra/engines/vlm/service.py +0 -12
- doctra/parsers/enhanced_pdf_parser.py +370 -0
- doctra/parsers/structured_pdf_parser.py +11 -60
- doctra/parsers/table_chart_extractor.py +8 -44
- doctra/third_party/docres/data/MBD/MBD.py +110 -0
- doctra/third_party/docres/data/MBD/MBD_utils.py +291 -0
- doctra/third_party/docres/data/MBD/infer.py +151 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +95 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +13 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +402 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +151 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +170 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +288 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +59 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +81 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +12 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +282 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +129 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +88 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +29 -0
- doctra/third_party/docres/data/preprocess/crop_merge_image.py +142 -0
- doctra/third_party/docres/inference.py +370 -0
- doctra/third_party/docres/models/restormer_arch.py +308 -0
- doctra/third_party/docres/utils.py +464 -0
- doctra/ui/app.py +5 -32
- doctra/utils/progress.py +13 -98
- doctra/utils/structured_utils.py +45 -49
- doctra/version.py +1 -1
- {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/METADATA +1 -1
- doctra-0.4.0.dist-info/RECORD +67 -0
- doctra-0.3.2.dist-info/RECORD +0 -44
- {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/WHEEL +0 -0
- {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,370 @@
|
|
1
|
+
"""
|
2
|
+
Enhanced PDF Parser with Image Restoration
|
3
|
+
|
4
|
+
This module provides an enhanced PDF parser that combines the structured parsing
|
5
|
+
capabilities with DocRes image restoration for improved document processing.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from __future__ import annotations
|
9
|
+
import os
|
10
|
+
import sys
|
11
|
+
from typing import List, Dict, Any, Optional, Union
|
12
|
+
from contextlib import ExitStack
|
13
|
+
from PIL import Image
|
14
|
+
from tqdm import tqdm
|
15
|
+
|
16
|
+
from doctra.parsers.structured_pdf_parser import StructuredPDFParser
|
17
|
+
from doctra.engines.image_restoration import DocResEngine
|
18
|
+
from doctra.utils.pdf_io import render_pdf_to_images
|
19
|
+
from doctra.utils.constants import IMAGE_SUBDIRS
|
20
|
+
from doctra.utils.file_ops import ensure_output_dirs
|
21
|
+
from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
|
22
|
+
|
23
|
+
|
24
|
+
class EnhancedPDFParser(StructuredPDFParser):
|
25
|
+
"""
|
26
|
+
Enhanced PDF Parser with Image Restoration capabilities.
|
27
|
+
|
28
|
+
Extends the StructuredPDFParser with DocRes image restoration to improve
|
29
|
+
document quality before processing. This is particularly useful for:
|
30
|
+
- Scanned documents with shadows or distortion
|
31
|
+
- Low-quality PDFs that need enhancement
|
32
|
+
- Documents with perspective issues
|
33
|
+
|
34
|
+
:param use_image_restoration: Whether to apply DocRes image restoration (default: True)
|
35
|
+
:param restoration_task: DocRes task to use ("dewarping", "deshadowing", "appearance", "deblurring", "binarization", "end2end", default: "appearance")
|
36
|
+
:param restoration_device: Device for DocRes processing ("cuda", "cpu", or None for auto-detect, default: None)
|
37
|
+
:param restoration_dpi: DPI for restoration processing (default: 200)
|
38
|
+
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
39
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
40
|
+
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
41
|
+
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
42
|
+
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
43
|
+
:param dpi: DPI for PDF rendering (default: 200)
|
44
|
+
:param min_score: Minimum confidence score for layout detection (default: 0.0)
|
45
|
+
:param ocr_lang: OCR language code (default: "eng")
|
46
|
+
:param ocr_psm: Tesseract page segmentation mode (default: 4)
|
47
|
+
:param ocr_oem: Tesseract OCR engine mode (default: 3)
|
48
|
+
:param ocr_extra_config: Additional Tesseract configuration (default: "")
|
49
|
+
:param box_separator: Separator between text boxes in output (default: "\n")
|
50
|
+
"""
|
51
|
+
|
52
|
+
def __init__(
    self,
    *,
    use_image_restoration: bool = True,
    restoration_task: str = "appearance",
    restoration_device: Optional[str] = None,
    restoration_dpi: int = 200,
    use_vlm: bool = False,
    vlm_provider: str = "gemini",
    vlm_model: str | None = None,
    vlm_api_key: str | None = None,
    layout_model_name: str = "PP-DocLayout_plus-L",
    dpi: int = 200,
    min_score: float = 0.0,
    ocr_lang: str = "eng",
    ocr_psm: int = 4,
    ocr_oem: int = 3,
    ocr_extra_config: str = "",
    box_separator: str = "\n",
):
    """
    Configure the base structured parser and, if requested, the DocRes engine.

    All arguments are keyword-only. The VLM, layout, DPI and OCR options are
    forwarded unchanged to ``StructuredPDFParser.__init__``; the
    ``restoration_*`` options are stored on the instance.

    If the DocRes engine cannot be constructed, a warning is printed and the
    parser silently degrades to plain parsing (``use_image_restoration`` is
    reset to ``False`` and ``docres_engine`` stays ``None``).
    """
    # Options understood by the parent parser, passed through as-is.
    parent_options = dict(
        use_vlm=use_vlm,
        vlm_provider=vlm_provider,
        vlm_model=vlm_model,
        vlm_api_key=vlm_api_key,
        layout_model_name=layout_model_name,
        dpi=dpi,
        min_score=min_score,
        ocr_lang=ocr_lang,
        ocr_psm=ocr_psm,
        ocr_oem=ocr_oem,
        ocr_extra_config=ocr_extra_config,
        box_separator=box_separator,
    )
    super().__init__(**parent_options)

    # Restoration configuration; the engine is only built on demand below.
    self.docres_engine = None
    self.restoration_dpi = restoration_dpi
    self.restoration_device = restoration_device
    self.restoration_task = restoration_task
    self.use_image_restoration = use_image_restoration

    if not self.use_image_restoration:
        return

    try:
        self.docres_engine = DocResEngine(
            device=restoration_device,
            use_half_precision=True,
        )
        print(f"✅ DocRes engine initialized with task: {restoration_task}")
    except Exception as e:
        # Best-effort: a broken restoration backend must not prevent parsing.
        print(f"⚠️ DocRes initialization failed: {e}")
        print(" Continuing without image restoration...")
        self.use_image_restoration = False
        self.docres_engine = None
|
111
|
+
|
112
|
+
def parse(self, pdf_path: str, enhanced_output_dir: Optional[str] = None) -> None:
    """
    Parse a PDF document with optional image restoration.

    Page images are either restored through DocRes (when restoration is
    enabled and the engine is available) or rendered directly from the PDF.
    Layout detection is then run on ``pdf_path`` and the detected boxes are
    applied to those page images by ``_process_parsing_logic``.

    :param pdf_path: Path to the input PDF file
    :param enhanced_output_dir: Directory for all outputs (if None, uses
        ``outputs/<pdf name>/enhanced_parse``)
    :return: None
    """
    pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]

    if enhanced_output_dir is None:
        out_dir = f"outputs/{pdf_filename}/enhanced_parse"
    else:
        out_dir = enhanced_output_dir

    os.makedirs(out_dir, exist_ok=True)
    ensure_output_dirs(out_dir, IMAGE_SUBDIRS)

    # Remember the DPI the page images were rendered at: layout boxes are
    # later used to crop/OCR these images, so both must share one DPI.
    if self.use_image_restoration and self.docres_engine:
        print(f"🔄 Processing PDF with image restoration: {os.path.basename(pdf_path)}")
        enhanced_pages = self._process_pages_with_restoration(pdf_path, out_dir)
        page_dpi = self.restoration_dpi
    else:
        print(f"🔄 Processing PDF without image restoration: {os.path.basename(pdf_path)}")
        enhanced_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
        page_dpi = self.dpi

    # NOTE: the layout engine reads the PDF itself, not the restored images.
    # Using page_dpi (instead of always self.dpi) keeps the predicted box
    # coordinates aligned with the images when restoration_dpi != dpi.
    # Assumes restoration keeps the page pixel dimensions — TODO confirm for
    # geometry-changing tasks such as "dewarping".
    print("🔍 Running layout detection on enhanced pages...")
    pages = self.layout_engine.predict_pdf(
        pdf_path, batch_size=1, layout_nms=True, dpi=page_dpi, min_score=self.min_score
    )

    self._process_parsing_logic(pages, enhanced_pages, out_dir, pdf_filename)
|
150
|
+
|
151
|
+
def _process_pages_with_restoration(self, pdf_path: str, out_dir: str) -> List[Image.Image]:
    """
    Process PDF pages with DocRes image restoration.

    Pages are rendered at ``self.restoration_dpi`` and passed one by one to
    ``self.docres_engine.restore_image``. Each successfully restored page is
    also written to ``<out_dir>/enhanced_pages/page_NNN_enhanced.jpg``.
    If anything fails for a page, the original rendering is used instead.

    The returned list always contains exactly one image per rendered page,
    in page order, so it can be indexed by page number.

    :param pdf_path: Path to the input PDF file
    :param out_dir: Output directory for enhanced images
    :return: List of enhanced PIL images (empty if the PDF has no pages)
    """
    # Imported once per call rather than once per page.
    import numpy as np

    original_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.restoration_dpi)]

    if not original_pages:
        print("❌ No pages found in PDF")
        return []

    total_pages = len(original_pages)
    bar_desc = f"🔄 DocRes {self.restoration_task}"

    is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
    if is_notebook:
        progress_bar = create_notebook_friendly_bar(total=total_pages, desc=bar_desc)
    else:
        progress_bar = create_beautiful_progress_bar(total=total_pages, desc=bar_desc, leave=True)

    enhanced_dir = os.path.join(out_dir, "enhanced_pages")
    os.makedirs(enhanced_dir, exist_ok=True)

    enhanced_pages: List[Image.Image] = []

    try:
        with progress_bar:
            for page_no, page_img in enumerate(original_pages, start=1):
                # Default to the original; replaced only if every step succeeds.
                result_page = page_img
                try:
                    restored_img, _metadata = self.docres_engine.restore_image(
                        np.array(page_img),
                        task=self.restoration_task
                    )
                    candidate = Image.fromarray(restored_img)

                    # Save enhanced page for reference
                    enhanced_path = os.path.join(enhanced_dir, f"page_{page_no:03d}_enhanced.jpg")
                    candidate.save(enhanced_path, "JPEG", quality=95)

                    result_page = candidate
                    status = f"✅ Page {page_no}/{total_pages} enhanced"
                except Exception as e:
                    print(f" ⚠️ Page {page_no} restoration failed: {e}, using original")
                    status = f"⚠️ Page {page_no} failed, using original"

                # Append exactly once per page. Appending inside the try and
                # again in the except would duplicate a page whenever a late
                # step (e.g. the JPEG save) fails, shifting all later pages.
                enhanced_pages.append(result_page)
                progress_bar.set_description(status)
                progress_bar.update(1)
    finally:
        # The with-block normally closes the bar already; this extra close is
        # kept as a safety net (presumably a no-op on a closed bar — verify
        # against the progress helpers).
        if hasattr(progress_bar, 'close'):
            progress_bar.close()

    print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
    return enhanced_pages
|
221
|
+
|
222
|
+
def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename):
    """
    Turn layout results plus page images into Markdown/HTML (and tables).

    For every layout box, in reading order:

    - boxes whose label is in ``EXCLUDE_LABELS`` are cropped and saved as
      images; figures are referenced from the Markdown as image links,
      charts and tables are converted to Markdown tables through the VLM
      when it is enabled, falling back to an image link otherwise;
    - every other box is OCR'd and its text appended to the Markdown.

    Structured chart/table results are additionally exported to
    ``tables.xlsx`` and ``tables.html`` when the VLM is enabled.

    :param pages: Layout pages; each exposes ``page_index`` and ``boxes``
    :param pil_pages: Page images, indexed by ``page_index - 1``
    :param out_dir: Output directory for all generated files
    :param pdf_filename: PDF base name (currently unused, kept for callers)
    :return: None
    """
    from doctra.utils.constants import EXCLUDE_LABELS
    from doctra.parsers.layout_order import reading_order_key
    from doctra.utils.ocr_utils import ocr_box_text
    from doctra.exporters.image_saver import save_box_image
    from doctra.exporters.markdown_writer import write_markdown
    from doctra.exporters.html_writer import write_html
    from doctra.exporters.excel_writer import write_structured_excel
    from doctra.exporters.html_writer import write_structured_html
    from doctra.utils.structured_utils import to_structured_dict
    from doctra.exporters.markdown_table import render_markdown_table

    def count_boxes(label):
        """Number of boxes carrying ``label`` across all pages."""
        return sum(1 for p in pages for b in p.boxes if b.label == label)

    fig_count = count_boxes("figure")
    chart_count = count_boxes("chart")
    table_count = count_boxes("table")

    md_lines: List[str] = ["# Enhanced Document Content\n"]
    structured_items: List[Dict[str, Any]] = []

    charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
    tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
    figures_desc = "Figures (cropped)"

    # Name of the VLM method that converts each visual kind to structured data.
    vlm_extractors = {"chart": "extract_chart", "table": "extract_table"}

    with ExitStack() as stack:
        is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules

        def open_bar(total, desc):
            """Create and enter a progress bar, or return None when total is 0."""
            if not total:
                return None
            if is_notebook:
                bar = create_notebook_friendly_bar(total=total, desc=desc)
            else:
                bar = create_beautiful_progress_bar(total=total, desc=desc, leave=True)
            return stack.enter_context(bar)

        bars = {
            "chart": open_bar(chart_count, charts_desc),
            "table": open_bar(table_count, tables_desc),
            "figure": open_bar(fig_count, figures_desc),
        }

        for p in pages:
            page_num = p.page_index
            page_img: Image.Image = pil_pages[page_num - 1]
            md_lines.append(f"\n## Page {page_num}\n")

            for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
                if box.label not in EXCLUDE_LABELS:
                    # Plain text region: OCR it and append the text.
                    text = ocr_box_text(self.ocr_engine, page_img, box)
                    if text:
                        md_lines.append(text)
                        md_lines.append(self.box_separator if self.box_separator else "")
                    continue

                img_path = save_box_image(page_img, box, out_dir, page_num, i, IMAGE_SUBDIRS)
                abs_img_path = os.path.abspath(img_path)
                # Link relative to out_dir so the Markdown stays portable.
                rel = os.path.relpath(abs_img_path, out_dir)

                kind = box.label
                if kind not in bars:
                    # Other excluded labels: the crop is saved, nothing is written.
                    continue

                wrote_table = False
                extractor_name = vlm_extractors.get(kind)
                if extractor_name and self.use_vlm and self.vlm:
                    try:
                        extracted = getattr(self.vlm, extractor_name)(abs_img_path)
                        item = to_structured_dict(extracted)
                        if item:
                            structured_items.append(item)
                            md_lines.append(
                                render_markdown_table(item.get("headers"), item.get("rows"),
                                                      title=item.get("title"))
                            )
                            wrote_table = True
                    except Exception as e:
                        # Best-effort: report the failure instead of hiding it,
                        # then fall back to the cropped image below.
                        print(f"⚠️ VLM {kind} extraction failed on page {page_num}: {e}; using cropped image")

                if not wrote_table:
                    md_lines.append(f"![{kind.title()} — page {page_num}]({rel})\n")

                bar = bars[kind]
                if bar:
                    bar.update(1)

    write_markdown(md_lines, out_dir)
    write_html(md_lines, out_dir)

    if self.use_vlm and structured_items:
        write_structured_excel(os.path.join(out_dir, "tables.xlsx"), structured_items)
        write_structured_html(os.path.join(out_dir, "tables.html"), structured_items)

    print(f"✅ Enhanced parsing completed successfully!")
    print(f"📁 Output directory: {out_dir}")
|
341
|
+
|
342
|
+
def restore_pdf_only(self, pdf_path: str, output_path: str = None, task: str = None) -> str:
    """
    Run DocRes restoration on a PDF and skip layout parsing entirely.

    The work is delegated to ``self.docres_engine.restore_pdf`` using the
    instance's ``restoration_dpi``.

    :param pdf_path: Path to the input PDF file
    :param output_path: Destination for the enhanced PDF (forwarded as-is;
        ``None`` lets the engine choose)
    :param task: DocRes task to run; a falsy value selects
        ``self.restoration_task``
    :return: Whatever the engine's ``restore_pdf`` returns (documented
        upstream as the enhanced PDF path, or None on failure — confirm
        against the engine)
    :raises RuntimeError: If restoration is disabled or no engine is available
    """
    restoration_ready = self.use_image_restoration and self.docres_engine
    if not restoration_ready:
        raise RuntimeError("Image restoration is not enabled or DocRes engine is not available")

    chosen_task = task if task else self.restoration_task
    return self.docres_engine.restore_pdf(
        pdf_path,
        output_path,
        chosen_task,
        self.restoration_dpi,
    )
|
356
|
+
|
357
|
+
def get_restoration_info(self) -> Dict[str, Any]:
    """
    Summarize the restoration settings held by this parser.

    :return: Dictionary with the keys ``enabled``, ``task``, ``device``,
        ``dpi``, ``engine_available`` and ``supported_tasks``; the task list
        is queried from the engine and is empty when no engine is present
    """
    engine = self.docres_engine

    if engine:
        supported_tasks = engine.get_supported_tasks()
    else:
        supported_tasks = []

    info: Dict[str, Any] = {}
    info['enabled'] = self.use_image_restoration
    info['task'] = self.restoration_task
    info['device'] = self.restoration_device
    info['dpi'] = self.restoration_dpi
    info['engine_available'] = engine is not None
    info['supported_tasks'] = supported_tasks
    return info
|
@@ -64,22 +64,19 @@ class StructuredPDFParser:
|
|
64
64
|
):
|
65
65
|
"""
|
66
66
|
Initialize the StructuredPDFParser with processing configuration.
|
67
|
-
|
68
|
-
Sets up the layout detection engine, OCR engine, and optionally
|
69
|
-
the VLM service for comprehensive document processing.
|
70
67
|
|
71
|
-
:param use_vlm: Whether to use VLM for structured data extraction
|
72
|
-
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
|
68
|
+
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
69
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
73
70
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
74
|
-
:param vlm_api_key: API key for VLM provider
|
75
|
-
:param layout_model_name: Layout detection model name
|
76
|
-
:param dpi: DPI for PDF rendering
|
77
|
-
:param min_score: Minimum confidence score for layout detection
|
78
|
-
:param ocr_lang: OCR language code
|
79
|
-
:param ocr_psm: Tesseract page segmentation mode
|
80
|
-
:param ocr_oem: Tesseract OCR engine mode
|
81
|
-
:param ocr_extra_config: Additional Tesseract configuration
|
82
|
-
:param box_separator: Separator between text boxes in output
|
71
|
+
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
72
|
+
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
73
|
+
:param dpi: DPI for PDF rendering (default: 200)
|
74
|
+
:param min_score: Minimum confidence score for layout detection (default: 0.0)
|
75
|
+
:param ocr_lang: OCR language code (default: "eng")
|
76
|
+
:param ocr_psm: Tesseract page segmentation mode (default: 4)
|
77
|
+
:param ocr_oem: Tesseract OCR engine mode (default: 3)
|
78
|
+
:param ocr_extra_config: Additional Tesseract configuration (default: "")
|
79
|
+
:param box_separator: Separator between text boxes in output (default: "\n")
|
83
80
|
"""
|
84
81
|
self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
|
85
82
|
self.dpi = dpi
|
@@ -100,15 +97,10 @@ class StructuredPDFParser:
|
|
100
97
|
def parse(self, pdf_path: str) -> None:
|
101
98
|
"""
|
102
99
|
Parse a PDF document and extract all content types.
|
103
|
-
|
104
|
-
Processes the PDF through layout detection, extracts text using OCR,
|
105
|
-
saves images for visual elements, and optionally converts charts/tables
|
106
|
-
to structured data using VLM.
|
107
100
|
|
108
101
|
:param pdf_path: Path to the input PDF file
|
109
102
|
:return: None
|
110
103
|
"""
|
111
|
-
# Extract filename without extension and create output directory
|
112
104
|
pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
113
105
|
out_dir = f"outputs/{pdf_filename}/full_parse"
|
114
106
|
|
@@ -120,7 +112,6 @@ class StructuredPDFParser:
|
|
120
112
|
)
|
121
113
|
pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
|
122
114
|
|
123
|
-
# Count for progress bars
|
124
115
|
fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
|
125
116
|
chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
|
126
117
|
table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
|
@@ -133,11 +124,8 @@ class StructuredPDFParser:
|
|
133
124
|
figures_desc = "Figures (cropped)"
|
134
125
|
|
135
126
|
with ExitStack() as stack:
|
136
|
-
# Enhanced environment detection
|
137
127
|
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
138
128
|
is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
|
139
|
-
|
140
|
-
# Use appropriate progress bars based on environment
|
141
129
|
if is_notebook:
|
142
130
|
charts_bar = stack.enter_context(
|
143
131
|
create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
|
@@ -165,13 +153,11 @@ class StructuredPDFParser:
|
|
165
153
|
rel = os.path.relpath(abs_img_path, out_dir)
|
166
154
|
|
167
155
|
if box.label == "figure":
|
168
|
-
# Figures are always images in MD
|
169
156
|
md_lines.append(f"\n")
|
170
157
|
if figures_bar: figures_bar.update(1)
|
171
158
|
|
172
159
|
elif box.label == "chart":
|
173
160
|
if self.use_vlm and self.vlm:
|
174
|
-
# Try structured → Markdown table; fallback to image if it fails
|
175
161
|
wrote_table = False
|
176
162
|
try:
|
177
163
|
chart = self.vlm.extract_chart(abs_img_path)
|
@@ -193,7 +179,6 @@ class StructuredPDFParser:
|
|
193
179
|
|
194
180
|
elif box.label == "table":
|
195
181
|
if self.use_vlm and self.vlm:
|
196
|
-
# Try structured → Markdown table; fallback to image if it fails
|
197
182
|
wrote_table = False
|
198
183
|
try:
|
199
184
|
table = self.vlm.extract_table(abs_img_path)
|
@@ -229,7 +214,6 @@ class StructuredPDFParser:
|
|
229
214
|
html_structured_path = os.path.join(out_dir, "tables.html")
|
230
215
|
write_structured_html(html_structured_path, structured_items)
|
231
216
|
|
232
|
-
# Print completion message with output directory
|
233
217
|
print(f"✅ Parsing completed successfully!")
|
234
218
|
print(f"📁 Output directory: {out_dir}")
|
235
219
|
|
@@ -249,30 +233,25 @@ class StructuredPDFParser:
|
|
249
233
|
:param save_path: Optional path to save the visualization (if None, displays only)
|
250
234
|
:return: None
|
251
235
|
"""
|
252
|
-
# Get layout predictions
|
253
236
|
pages: List[LayoutPage] = self.layout_engine.predict_pdf(
|
254
237
|
pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
|
255
238
|
)
|
256
239
|
pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
|
257
240
|
|
258
|
-
# Limit to requested number of pages
|
259
241
|
pages_to_show = min(num_pages, len(pages))
|
260
242
|
|
261
243
|
if pages_to_show == 0:
|
262
244
|
print("No pages to display")
|
263
245
|
return
|
264
246
|
|
265
|
-
# Calculate grid dimensions
|
266
247
|
rows = (pages_to_show + cols - 1) // cols
|
267
248
|
|
268
|
-
# Collect unique labels from the processed pages and assign colors
|
269
249
|
used_labels = set()
|
270
250
|
for idx in range(pages_to_show):
|
271
251
|
page = pages[idx]
|
272
252
|
for box in page.boxes:
|
273
253
|
used_labels.add(box.label.lower())
|
274
254
|
|
275
|
-
# Create dynamic color assignment for all detected labels
|
276
255
|
base_colors = ['#3B82F6', '#EF4444', '#10B981', '#F59E0B', '#8B5CF6',
|
277
256
|
'#F97316', '#EC4899', '#6B7280', '#84CC16', '#06B6D4',
|
278
257
|
'#DC2626', '#059669', '#7C3AED', '#DB2777', '#0891B2']
|
@@ -281,22 +260,18 @@ class StructuredPDFParser:
|
|
281
260
|
for i, label in enumerate(sorted(used_labels)):
|
282
261
|
dynamic_label_colors[label] = base_colors[i % len(base_colors)]
|
283
262
|
|
284
|
-
# Process each page and add bounding boxes
|
285
263
|
processed_pages = []
|
286
264
|
|
287
265
|
for idx in range(pages_to_show):
|
288
266
|
page = pages[idx]
|
289
267
|
page_img = pil_pages[idx].copy()
|
290
268
|
|
291
|
-
# Calculate scale factor to resize to target width
|
292
269
|
scale_factor = page_width / page_img.width
|
293
270
|
new_height = int(page_img.height * scale_factor)
|
294
271
|
page_img = page_img.resize((page_width, new_height), Image.LANCZOS)
|
295
272
|
|
296
|
-
# Create drawing context
|
297
273
|
draw = ImageDraw.Draw(page_img)
|
298
274
|
|
299
|
-
# Try to load a nice font, fallback to default
|
300
275
|
try:
|
301
276
|
font = ImageFont.truetype("arial.ttf", 24)
|
302
277
|
small_font = ImageFont.truetype("arial.ttf", 18)
|
@@ -308,21 +283,16 @@ class StructuredPDFParser:
|
|
308
283
|
font = None
|
309
284
|
small_font = None
|
310
285
|
|
311
|
-
# Draw bounding boxes
|
312
286
|
for box in page.boxes:
|
313
|
-
# Scale coordinates
|
314
287
|
x1 = int(box.x1 * scale_factor)
|
315
288
|
y1 = int(box.y1 * scale_factor)
|
316
289
|
x2 = int(box.x2 * scale_factor)
|
317
290
|
y2 = int(box.y2 * scale_factor)
|
318
291
|
|
319
|
-
# Get color for this label from dynamic assignment
|
320
292
|
color = dynamic_label_colors.get(box.label.lower(), '#000000')
|
321
293
|
|
322
|
-
# Draw rectangle with rounded corners effect
|
323
294
|
draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
|
324
295
|
|
325
|
-
# Draw label background
|
326
296
|
label_text = f"{box.label} ({box.score:.2f})"
|
327
297
|
if font:
|
328
298
|
bbox = draw.textbbox((0, 0), label_text, font=small_font)
|
@@ -332,11 +302,9 @@ class StructuredPDFParser:
|
|
332
302
|
text_width = len(label_text) * 8
|
333
303
|
text_height = 15
|
334
304
|
|
335
|
-
# Position label above the box
|
336
305
|
label_x = x1
|
337
306
|
label_y = max(0, y1 - text_height - 8)
|
338
307
|
|
339
|
-
# Draw label background with padding
|
340
308
|
padding = 4
|
341
309
|
draw.rectangle([
|
342
310
|
label_x - padding,
|
@@ -345,10 +313,8 @@ class StructuredPDFParser:
|
|
345
313
|
label_y + text_height + padding
|
346
314
|
], fill='white', outline=color, width=2)
|
347
315
|
|
348
|
-
# Draw label text
|
349
316
|
draw.text((label_x, label_y), label_text, fill=color, font=small_font)
|
350
317
|
|
351
|
-
# Add page title
|
352
318
|
title_text = f"Page {page.page_index} ({len(page.boxes)} boxes)"
|
353
319
|
if font:
|
354
320
|
title_bbox = draw.textbbox((0, 0), title_text, font=font)
|
@@ -356,7 +322,6 @@ class StructuredPDFParser:
|
|
356
322
|
else:
|
357
323
|
title_width = len(title_text) * 12
|
358
324
|
|
359
|
-
# Draw title background
|
360
325
|
title_x = (page_width - title_width) // 2
|
361
326
|
title_y = 10
|
362
327
|
draw.rectangle([title_x - 10, title_y - 5, title_x + title_width + 10, title_y + 35],
|
@@ -365,16 +330,13 @@ class StructuredPDFParser:
|
|
365
330
|
|
366
331
|
processed_pages.append(page_img)
|
367
332
|
|
368
|
-
# Create grid layout with space for legend
|
369
333
|
legend_width = 250
|
370
334
|
grid_width = cols * page_width + (cols - 1) * spacing
|
371
335
|
total_width = grid_width + legend_width + spacing
|
372
336
|
grid_height = rows * (processed_pages[0].height if processed_pages else 600) + (rows - 1) * spacing
|
373
337
|
|
374
|
-
# Create final grid image with modern background
|
375
338
|
final_img = Image.new('RGB', (total_width, grid_height), '#F8FAFC')
|
376
339
|
|
377
|
-
# Place pages in grid
|
378
340
|
for idx, page_img in enumerate(processed_pages):
|
379
341
|
row = idx // cols
|
380
342
|
col = idx % cols
|
@@ -384,13 +346,11 @@ class StructuredPDFParser:
|
|
384
346
|
|
385
347
|
final_img.paste(page_img, (x_pos, y_pos))
|
386
348
|
|
387
|
-
# Create legend
|
388
349
|
legend_x = grid_width + spacing
|
389
350
|
legend_y = 20
|
390
351
|
|
391
352
|
draw_legend = ImageDraw.Draw(final_img)
|
392
353
|
|
393
|
-
# Legend title
|
394
354
|
legend_title = "Element Types"
|
395
355
|
if font:
|
396
356
|
title_bbox = draw_legend.textbbox((0, 0), legend_title, font=font)
|
@@ -400,47 +360,38 @@ class StructuredPDFParser:
|
|
400
360
|
title_width = len(legend_title) * 12
|
401
361
|
title_height = 20
|
402
362
|
|
403
|
-
# Draw legend background
|
404
363
|
legend_bg_height = len(used_labels) * 35 + title_height + 40
|
405
364
|
draw_legend.rectangle([legend_x - 10, legend_y - 10,
|
406
365
|
legend_x + legend_width - 10, legend_y + legend_bg_height],
|
407
366
|
fill='white', outline='#E5E7EB', width=2)
|
408
367
|
|
409
|
-
# Draw legend title
|
410
368
|
draw_legend.text((legend_x + 10, legend_y + 5), legend_title,
|
411
369
|
fill='#1F2937', font=font)
|
412
370
|
|
413
|
-
# Draw legend items - now using dynamic colors for actually detected labels
|
414
371
|
current_y = legend_y + title_height + 20
|
415
372
|
|
416
373
|
for label in sorted(used_labels):
|
417
374
|
color = dynamic_label_colors[label]
|
418
375
|
|
419
|
-
# Draw color square
|
420
376
|
square_size = 20
|
421
377
|
draw_legend.rectangle([legend_x + 10, current_y,
|
422
378
|
legend_x + 10 + square_size, current_y + square_size],
|
423
379
|
fill=color, outline='#6B7280', width=1)
|
424
380
|
|
425
|
-
# Draw label text
|
426
381
|
draw_legend.text((legend_x + 40, current_y + 2), label.title(),
|
427
382
|
fill='#374151', font=small_font)
|
428
383
|
|
429
384
|
current_y += 30
|
430
385
|
|
431
|
-
# Save or display
|
432
386
|
if save_path:
|
433
387
|
final_img.save(save_path, quality=95, optimize=True)
|
434
388
|
print(f"Layout visualization saved to: {save_path}")
|
435
389
|
else:
|
436
|
-
# Display using PIL's default viewer
|
437
390
|
final_img.show()
|
438
391
|
|
439
|
-
# Print summary statistics
|
440
392
|
print(f"\n📊 Layout Detection Summary for {os.path.basename(pdf_path)}:")
|
441
393
|
print(f"Pages processed: {pages_to_show}")
|
442
394
|
|
443
|
-
# Create summary by label across all pages
|
444
395
|
total_counts = {}
|
445
396
|
for idx in range(pages_to_show):
|
446
397
|
page = pages[idx]
|