doctra 0.3.3__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +4 -0
- doctra/cli/main.py +170 -9
- doctra/cli/utils.py +2 -3
- doctra/engines/image_restoration/__init__.py +10 -0
- doctra/engines/image_restoration/docres_engine.py +561 -0
- doctra/engines/vlm/outlines_types.py +13 -9
- doctra/engines/vlm/service.py +4 -2
- doctra/exporters/excel_writer.py +89 -0
- doctra/parsers/enhanced_pdf_parser.py +374 -0
- doctra/parsers/structured_pdf_parser.py +6 -0
- doctra/parsers/table_chart_extractor.py +6 -0
- doctra/third_party/docres/data/MBD/MBD.py +110 -0
- doctra/third_party/docres/data/MBD/MBD_utils.py +291 -0
- doctra/third_party/docres/data/MBD/infer.py +151 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +95 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +13 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +402 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +151 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +170 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +288 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +59 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +81 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +12 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +282 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +129 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +88 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +29 -0
- doctra/third_party/docres/data/preprocess/crop_merge_image.py +142 -0
- doctra/third_party/docres/inference.py +370 -0
- doctra/third_party/docres/models/restormer_arch.py +308 -0
- doctra/third_party/docres/utils.py +464 -0
- doctra/ui/app.py +8 -14
- doctra/utils/structured_utils.py +5 -2
- doctra/version.py +1 -1
- {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/METADATA +1 -1
- doctra-0.4.1.dist-info/RECORD +67 -0
- doctra-0.3.3.dist-info/RECORD +0 -44
- {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/WHEEL +0 -0
- {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/top_level.txt +0 -0
doctra/exporters/excel_writer.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Dict, Any, List, Set
 import pandas as pd  # pip install pandas openpyxl
 from openpyxl.styles import PatternFill, Font, Alignment
 from openpyxl.utils import get_column_letter
+from openpyxl.worksheet.hyperlink import Hyperlink

 _INVALID_SHEET_CHARS = r'[:\\/*?\[\]]'  # Excel-invalid characters
 _MAX_SHEET_LEN = 31
@@ -85,6 +86,61 @@ def _autosize_columns(ws, df: pd.DataFrame) -> None:
         ws.column_dimensions[get_column_letter(i)].width = min(max(10, max_len + 2), 60)


+def _style_summary_sheet(ws, df: pd.DataFrame, sheet_mapping: dict = None) -> None:
+    """
+    Apply special styling to the summary sheet with text wrapping for descriptions.
+    Add hyperlinks to table titles that link to their corresponding sheets.
+
+    :param ws: OpenPyXL worksheet object to style
+    :param df: Pandas DataFrame containing the summary data
+    :param sheet_mapping: Dictionary mapping table titles to their sheet names
+    :return: None
+    """
+    # Style header row
+    _style_header(ws, ncols=df.shape[1])
+
+    # Apply text wrapping to all data cells
+    wrap_alignment = Alignment(wrap_text=True, vertical="top")
+
+    # Apply wrapping to all data rows (skip header row)
+    for row_idx in range(2, len(df) + 2):  # Start from row 2 (after header)
+        for col_idx in range(1, df.shape[1] + 1):
+            cell = ws.cell(row=row_idx, column=col_idx)
+            cell.alignment = wrap_alignment
+
+            # Add hyperlink to table title column (column A)
+            if col_idx == 1 and sheet_mapping:  # Table Title column
+                table_title = cell.value
+                if table_title and table_title in sheet_mapping:
+                    sheet_name = sheet_mapping[table_title]
+
+                    # Create hyperlink to the sheet using proper Excel format
+                    # Escape sheet name if it contains spaces or special characters
+                    if ' ' in sheet_name or any(char in sheet_name for char in ['[', ']', '*', '?', ':', '\\', '/']):
+                        hyperlink_ref = f"#'{sheet_name}'!A1"
+                    else:
+                        hyperlink_ref = f"#{sheet_name}!A1"
+
+                    # Use Hyperlink class with proper parameters
+                    cell.hyperlink = Hyperlink(ref=hyperlink_ref, target=hyperlink_ref)
+                    # Style the hyperlink
+                    cell.font = Font(color="0000FF", underline="single")
+
+    # Set specific column widths for summary sheet
+    # Table Title column - narrower
+    ws.column_dimensions['A'].width = 30
+    # Description column - wider to accommodate wrapped text
+    ws.column_dimensions['B'].width = 60
+    # Page column - narrow for page numbers
+    ws.column_dimensions['C'].width = 10
+    # Type column - narrow for Table/Chart
+    ws.column_dimensions['D'].width = 12
+
+    # Set row heights to accommodate wrapped text
+    for row_idx in range(2, len(df) + 2):
+        ws.row_dimensions[row_idx].height = 60  # Allow for multiple lines
+
+
 def _normalize_data(headers: List[str], rows: List[List]) -> tuple[List[str], List[List]]:
     """
     Normalize headers and rows to ensure consistent dimensions.
@@ -159,6 +215,31 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
     taken: Set[str] = set()

     with pd.ExcelWriter(excel_path, engine="openpyxl", mode="w") as writer:
+        # Create summary sheet first
+        summary_data = []
+        sheet_mapping = {}  # Map table titles to their sheet names
+
+        for item in valid_items:
+            title = item.get("title") or "Untitled"
+            description = item.get("description") or "No description available"
+            page_number = item.get("page", "Unknown")
+            item_type = item.get("type", "Table")  # Default to "Table" if not specified
+
+
+            summary_data.append({
+                "Table Title": title,
+                "Description": description,
+                "Page": page_number,
+                "Type": item_type
+            })
+
+        # Create summary sheet first (but without hyperlinks initially)
+        if summary_data:
+            summary_df = pd.DataFrame(summary_data)
+            summary_df.to_excel(writer, sheet_name="Table Summary", index=False)
+            taken.add("Table Summary")
+
+        # Process individual table sheets to build sheet mapping
         for item in valid_items:
             try:
                 title = item.get("title") or "Untitled"
@@ -166,6 +247,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
                 rows = item.get("rows") or []

                 sheet_name = _safe_sheet_name(title, taken)
+
+                # Add to sheet mapping for hyperlinks
+                sheet_mapping[title] = sheet_name

                 # Normalize data to handle mismatched dimensions
                 normalized_headers, normalized_rows = _normalize_data(headers, rows)
@@ -194,4 +278,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
                 print(f"Error processing item '{item.get('title', 'Unknown')}': {e}")
                 continue

+        # Now add hyperlinks to the summary sheet (after all sheets are created)
+        if summary_data and sheet_mapping:
+            summary_ws = writer.sheets["Table Summary"]
+            _style_summary_sheet(summary_ws, summary_df, sheet_mapping)
+
     return excel_path
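For orientation, a minimal usage sketch (not part of the package) of how these changes surface when calling write_structured_excel directly; the item values below are illustrative, but the keys mirror the ones read by the code above (title, description, page, type, headers, rows):

from doctra.exporters.excel_writer import write_structured_excel

# Illustrative items; "page" and "type" are the new keys consumed by the summary sheet.
items = [
    {
        "title": "Revenue by Region",
        "description": "Quarterly revenue split by region",
        "page": 3,
        "type": "Table",
        "headers": ["Region", "Q1", "Q2"],
        "rows": [["EMEA", 120, 135], ["APAC", 90, 110]],
    },
]

# Writes a "Table Summary" sheet first, then one sheet per item, and finally links
# each summary title to its sheet via an internal #'Sheet Name'!A1 hyperlink.
excel_path = write_structured_excel("tables.xlsx", items)
print(excel_path)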
doctra/parsers/enhanced_pdf_parser.py
ADDED
@@ -0,0 +1,374 @@
+"""
+Enhanced PDF Parser with Image Restoration
+
+This module provides an enhanced PDF parser that combines the structured parsing
+capabilities with DocRes image restoration for improved document processing.
+"""
+
+from __future__ import annotations
+import os
+import sys
+import numpy as np
+from typing import List, Dict, Any, Optional, Union
+from contextlib import ExitStack
+from PIL import Image
+from tqdm import tqdm
+
+from doctra.parsers.structured_pdf_parser import StructuredPDFParser
+from doctra.engines.image_restoration import DocResEngine
+from doctra.utils.pdf_io import render_pdf_to_images
+from doctra.utils.constants import IMAGE_SUBDIRS, EXCLUDE_LABELS
+from doctra.utils.file_ops import ensure_output_dirs
+from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
+from doctra.parsers.layout_order import reading_order_key
+from doctra.utils.ocr_utils import ocr_box_text
+from doctra.exporters.image_saver import save_box_image
+from doctra.exporters.markdown_writer import write_markdown
+from doctra.exporters.html_writer import write_html, write_structured_html
+from doctra.exporters.excel_writer import write_structured_excel
+from doctra.utils.structured_utils import to_structured_dict
+from doctra.exporters.markdown_table import render_markdown_table
+
+
+class EnhancedPDFParser(StructuredPDFParser):
+    """
+    Enhanced PDF Parser with Image Restoration capabilities.
+
+    Extends the StructuredPDFParser with DocRes image restoration to improve
+    document quality before processing. This is particularly useful for:
+    - Scanned documents with shadows or distortion
+    - Low-quality PDFs that need enhancement
+    - Documents with perspective issues
+
+    :param use_image_restoration: Whether to apply DocRes image restoration (default: True)
+    :param restoration_task: DocRes task to use ("dewarping", "deshadowing", "appearance", "deblurring", "binarization", "end2end", default: "appearance")
+    :param restoration_device: Device for DocRes processing ("cuda", "cpu", or None for auto-detect, default: None)
+    :param restoration_dpi: DPI for restoration processing (default: 200)
+    :param use_vlm: Whether to use VLM for structured data extraction (default: False)
+    :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
+    :param vlm_model: Model name to use (defaults to provider-specific defaults)
+    :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
+    :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
+    :param dpi: DPI for PDF rendering (default: 200)
+    :param min_score: Minimum confidence score for layout detection (default: 0.0)
+    :param ocr_lang: OCR language code (default: "eng")
+    :param ocr_psm: Tesseract page segmentation mode (default: 4)
+    :param ocr_oem: Tesseract OCR engine mode (default: 3)
+    :param ocr_extra_config: Additional Tesseract configuration (default: "")
+    :param box_separator: Separator between text boxes in output (default: "\n")
+    """
+
+    def __init__(
+        self,
+        *,
+        use_image_restoration: bool = True,
+        restoration_task: str = "appearance",
+        restoration_device: Optional[str] = None,
+        restoration_dpi: int = 200,
+        use_vlm: bool = False,
+        vlm_provider: str = "gemini",
+        vlm_model: str | None = None,
+        vlm_api_key: str | None = None,
+        layout_model_name: str = "PP-DocLayout_plus-L",
+        dpi: int = 200,
+        min_score: float = 0.0,
+        ocr_lang: str = "eng",
+        ocr_psm: int = 4,
+        ocr_oem: int = 3,
+        ocr_extra_config: str = "",
+        box_separator: str = "\n",
+    ):
+        """
+        Initialize the Enhanced PDF Parser with image restoration capabilities.
+        """
+        # Initialize parent class
+        super().__init__(
+            use_vlm=use_vlm,
+            vlm_provider=vlm_provider,
+            vlm_model=vlm_model,
+            vlm_api_key=vlm_api_key,
+            layout_model_name=layout_model_name,
+            dpi=dpi,
+            min_score=min_score,
+            ocr_lang=ocr_lang,
+            ocr_psm=ocr_psm,
+            ocr_oem=ocr_oem,
+            ocr_extra_config=ocr_extra_config,
+            box_separator=box_separator,
+        )
+
+        # Image restoration settings
+        self.use_image_restoration = use_image_restoration
+        self.restoration_task = restoration_task
+        self.restoration_device = restoration_device
+        self.restoration_dpi = restoration_dpi
+
+        # Initialize DocRes engine if needed
+        self.docres_engine = None
+        if self.use_image_restoration:
+            try:
+                self.docres_engine = DocResEngine(
+                    device=restoration_device,
+                    use_half_precision=True
+                )
+                print(f"✅ DocRes engine initialized with task: {restoration_task}")
+            except Exception as e:
+                print(f"⚠️ DocRes initialization failed: {e}")
+                print("   Continuing without image restoration...")
+                self.use_image_restoration = False
+                self.docres_engine = None
+
+    def parse(self, pdf_path: str, enhanced_output_dir: str = None) -> None:
+        """
+        Parse a PDF document with optional image restoration.
+
+        :param pdf_path: Path to the input PDF file
+        :param enhanced_output_dir: Directory for enhanced images (if None, uses default)
+        :return: None
+        """
+        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
+
+        # Set up output directories
+        if enhanced_output_dir is None:
+            out_dir = f"outputs/{pdf_filename}/enhanced_parse"
+        else:
+            out_dir = enhanced_output_dir
+
+        os.makedirs(out_dir, exist_ok=True)
+        ensure_output_dirs(out_dir, IMAGE_SUBDIRS)
+
+        # Process PDF pages with optional restoration
+        if self.use_image_restoration and self.docres_engine:
+            print(f"🔄 Processing PDF with image restoration: {os.path.basename(pdf_path)}")
+            enhanced_pages = self._process_pages_with_restoration(pdf_path, out_dir)
+        else:
+            print(f"🔄 Processing PDF without image restoration: {os.path.basename(pdf_path)}")
+            enhanced_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
+
+        # Run layout detection on enhanced pages
+        print("🔍 Running layout detection on enhanced pages...")
+        pages = self.layout_engine.predict_pdf(
+            pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
+        )
+
+        # Use enhanced pages for processing
+        pil_pages = enhanced_pages
+
+        # Continue with standard parsing logic
+        self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename, pdf_path)
+
+    def _process_pages_with_restoration(self, pdf_path: str, out_dir: str) -> List[Image.Image]:
+        """
+        Process PDF pages with DocRes image restoration.
+
+        :param pdf_path: Path to the input PDF file
+        :param out_dir: Output directory for enhanced images
+        :return: List of enhanced PIL images
+        """
+        # Render original pages
+        original_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.restoration_dpi)]
+
+        if not original_pages:
+            print("❌ No pages found in PDF")
+            return []
+
+        # Create progress bar
+        is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
+        if is_notebook:
+            progress_bar = create_notebook_friendly_bar(
+                total=len(original_pages),
+                desc=f"🔄 DocRes {self.restoration_task}"
+            )
+        else:
+            progress_bar = create_beautiful_progress_bar(
+                total=len(original_pages),
+                desc=f"🔄 DocRes {self.restoration_task}",
+                leave=True
+            )
+
+        enhanced_pages = []
+        enhanced_dir = os.path.join(out_dir, "enhanced_pages")
+        os.makedirs(enhanced_dir, exist_ok=True)
+
+        try:
+            with progress_bar:
+                for i, page_img in enumerate(original_pages):
+                    try:
+                        # Convert PIL to numpy array
+                        img_array = np.array(page_img)
+
+                        # Apply DocRes restoration
+                        restored_img, metadata = self.docres_engine.restore_image(
+                            img_array,
+                            task=self.restoration_task
+                        )
+
+                        # Convert back to PIL Image
+                        enhanced_page = Image.fromarray(restored_img)
+                        enhanced_pages.append(enhanced_page)
+
+                        # Save enhanced page for reference
+                        enhanced_path = os.path.join(enhanced_dir, f"page_{i+1:03d}_enhanced.jpg")
+                        enhanced_page.save(enhanced_path, "JPEG", quality=95)
+
+                        progress_bar.set_description(f"✅ Page {i+1}/{len(original_pages)} enhanced")
+                        progress_bar.update(1)
+
+                    except Exception as e:
+                        print(f"   ⚠️ Page {i+1} restoration failed: {e}, using original")
+                        enhanced_pages.append(page_img)
+                        progress_bar.set_description(f"⚠️ Page {i+1} failed, using original")
+                        progress_bar.update(1)
+
+        finally:
+            if hasattr(progress_bar, 'close'):
+                progress_bar.close()
+
+        print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
+        return enhanced_pages
+
+    def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename, pdf_path):
+        """
+        Process the parsing logic with enhanced pages.
+        This is extracted from the parent class to allow customization.
+        """
+
+        fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
+        chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
+        table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
+
+        md_lines: List[str] = ["# Enhanced Document Content\n"]
+        structured_items: List[Dict[str, Any]] = []
+
+        charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
+        tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
+        figures_desc = "Figures (cropped)"
+
+        with ExitStack() as stack:
+            is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
+            if is_notebook:
+                charts_bar = stack.enter_context(
+                    create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
+                tables_bar = stack.enter_context(
+                    create_notebook_friendly_bar(total=table_count, desc=tables_desc)) if table_count else None
+                figures_bar = stack.enter_context(
+                    create_notebook_friendly_bar(total=fig_count, desc=figures_desc)) if fig_count else None
+            else:
+                charts_bar = stack.enter_context(
+                    create_beautiful_progress_bar(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
+                tables_bar = stack.enter_context(
+                    create_beautiful_progress_bar(total=table_count, desc=tables_desc, leave=True)) if table_count else None
+                figures_bar = stack.enter_context(
+                    create_beautiful_progress_bar(total=fig_count, desc=figures_desc, leave=True)) if fig_count else None
+
+            for p in pages:
+                page_num = p.page_index
+                page_img: Image.Image = pil_pages[page_num - 1]
+                md_lines.append(f"\n## Page {page_num}\n")
+
+                for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
+                    if box.label in EXCLUDE_LABELS:
+                        img_path = save_box_image(page_img, box, out_dir, page_num, i, IMAGE_SUBDIRS)
+                        abs_img_path = os.path.abspath(img_path)
+                        rel = os.path.relpath(abs_img_path, out_dir)

+                        if box.label == "figure":
+                            md_lines.append(f"\n")
+                            if figures_bar: figures_bar.update(1)
+
+                        elif box.label == "chart":
+                            if self.use_vlm and self.vlm:
+                                wrote_table = False
+                                try:
+                                    chart = self.vlm.extract_chart(abs_img_path)
+                                    item = to_structured_dict(chart)
+                                    if item:
+                                        # Add page and type information to structured item
+                                        item["page"] = page_num
+                                        item["type"] = "Chart"
+                                        structured_items.append(item)
+                                        md_lines.append(
+                                            render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                  title=item.get("title"))
+                                        )
+                                        wrote_table = True
+                                except Exception as e:
+                                    pass
+                                if not wrote_table:
+                                    md_lines.append(f"\n")
+                            else:
+                                md_lines.append(f"\n")
+                            if charts_bar: charts_bar.update(1)
+
+                        elif box.label == "table":
+                            if self.use_vlm and self.vlm:
+                                wrote_table = False
+                                try:
+                                    table = self.vlm.extract_table(abs_img_path)
+                                    item = to_structured_dict(table)
+                                    if item:
+                                        # Add page and type information to structured item
+                                        item["page"] = page_num
+                                        item["type"] = "Table"
+                                        structured_items.append(item)
+                                        md_lines.append(
+                                            render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                  title=item.get("title"))
+                                        )
+                                        wrote_table = True
+                                except Exception as e:
+                                    pass
+                                if not wrote_table:
+                                    md_lines.append(f"\n")
+                            else:
+                                md_lines.append(f"\n")
+                            if tables_bar: tables_bar.update(1)
+                    else:
+                        text = ocr_box_text(self.ocr_engine, page_img, box)
+                        if text:
+                            md_lines.append(text)
+                            md_lines.append(self.box_separator if self.box_separator else "")
+
+        md_path = write_markdown(md_lines, out_dir)
+        html_path = write_html(md_lines, out_dir)
+
+        excel_path = None
+        html_structured_path = None
+        if self.use_vlm and structured_items:
+            excel_path = os.path.join(out_dir, "tables.xlsx")
+            write_structured_excel(excel_path, structured_items)
+            html_structured_path = os.path.join(out_dir, "tables.html")
+            write_structured_html(html_structured_path, structured_items)
+
+        print(f"✅ Enhanced parsing completed successfully!")
+        print(f"📁 Output directory: {out_dir}")
+
+    def restore_pdf_only(self, pdf_path: str, output_path: str = None, task: str = None) -> str:
+        """
+        Apply DocRes restoration to a PDF without parsing.
+
+        :param pdf_path: Path to the input PDF file
+        :param output_path: Path for the enhanced PDF (if None, auto-generates)
+        :param task: DocRes restoration task (if None, uses instance default)
+        :return: Path to the enhanced PDF or None if failed
+        """
+        if not self.use_image_restoration or not self.docres_engine:
+            raise RuntimeError("Image restoration is not enabled or DocRes engine is not available")
+
+        task = task or self.restoration_task
+        return self.docres_engine.restore_pdf(pdf_path, output_path, task, self.restoration_dpi)
+
+    def get_restoration_info(self) -> Dict[str, Any]:
+        """
+        Get information about the current restoration configuration.
+
+        :return: Dictionary with restoration settings and status
+        """
+        return {
+            'enabled': self.use_image_restoration,
+            'task': self.restoration_task,
+            'device': self.restoration_device,
+            'dpi': self.restoration_dpi,
+            'engine_available': self.docres_engine is not None,
+            'supported_tasks': self.docres_engine.get_supported_tasks() if self.docres_engine else []
+        }
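A hedged usage sketch of the new class, based only on the constructor defaults and methods shown in this diff; the input filename is hypothetical, and whether restoration actually runs depends on the DocRes weights and device being available:

from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser

# Restoration-only defaults; set use_vlm=True and pass vlm_api_key to also
# extract structured tables/charts during parsing.
parser = EnhancedPDFParser(
    use_image_restoration=True,
    restoration_task="appearance",   # or "dewarping", "deshadowing", "deblurring", ...
    restoration_device=None,         # None -> auto-detect CUDA/CPU
)

# Full parse: restored pages, layout detection, OCR, markdown/HTML output
# under outputs/<pdf_name>/enhanced_parse by default.
parser.parse("scanned_report.pdf")

# Restoration without parsing, plus a quick look at the active configuration.
enhanced_pdf = parser.restore_pdf_only("scanned_report.pdf", task="dewarping")
print(parser.get_restoration_info())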
doctra/parsers/structured_pdf_parser.py
CHANGED
@@ -163,6 +163,9 @@ class StructuredPDFParser:
                             chart = self.vlm.extract_chart(abs_img_path)
                             item = to_structured_dict(chart)
                             if item:
+                                # Add page and type information to structured item
+                                item["page"] = page_num
+                                item["type"] = "Chart"
                                 structured_items.append(item)
                                 md_lines.append(
                                     render_markdown_table(item.get("headers"), item.get("rows"),
@@ -184,6 +187,9 @@ class StructuredPDFParser:
                             table = self.vlm.extract_table(abs_img_path)
                             item = to_structured_dict(table)
                             if item:
+                                # Add page and type information to structured item
+                                item["page"] = page_num
+                                item["type"] = "Table"
                                 structured_items.append(item)
                                 md_lines.append(
                                     render_markdown_table(item.get("headers"), item.get("rows"),
doctra/parsers/table_chart_extractor.py
CHANGED
@@ -178,6 +178,9 @@ class ChartTablePDFParser:
                         extracted_chart = self.vlm.extract_chart(chart_path)
                         structured_item = to_structured_dict(extracted_chart)
                         if structured_item:
+                            # Add page and type information to structured item
+                            structured_item["page"] = page_num
+                            structured_item["type"] = "Chart"
                             structured_items.append(structured_item)
                             vlm_items.append({
                                 "kind": "chart",
@@ -221,6 +224,9 @@ class ChartTablePDFParser:
                         extracted_table = self.vlm.extract_table(table_path)
                         structured_item = to_structured_dict(extracted_table)
                         if structured_item:
+                            # Add page and type information to structured item
+                            structured_item["page"] = page_num
+                            structured_item["type"] = "Table"
                             structured_items.append(structured_item)
                             vlm_items.append({
                                 "kind": "table",
doctra/third_party/docres/data/MBD/MBD.py
ADDED
@@ -0,0 +1,110 @@
+import cv2
+import numpy as np
+import MBD_utils
+import torch
+import torch.nn.functional as F
+
+
+def mask_base_dewarper(image,mask):
+    '''
+    input:
+        image -> ndarray HxWx3 uint8
+        mask -> ndarray HxW uint8
+    return
+        dewarped -> ndarray HxWx3 uint8
+        grid (optional) -> ndarray HxWx2 -1~1
+    '''
+
+    ## get contours
+    # _, contours, hierarchy = cv2.findContours(mask,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE) ## cv2.__version__ == 3.x
+    contours,hierarchy = cv2.findContours(mask,cv2.RETR_EXTERNAL,method=cv2.CHAIN_APPROX_SIMPLE) ## cv2.__version__ == 4.x
+
+    ## get biggest contours and four corners based on Douglas-Peucker algorithm
+    four_corners, maxArea, contour = MBD_utils.DP_algorithm(contours)
+    four_corners = MBD_utils.reorder(four_corners)
+
+    ## reserve biggest contours and remove other noisy contours
+    new_mask = np.zeros_like(mask)
+    new_mask = cv2.drawContours(new_mask,[contour],-1,255,cv2.FILLED)
+
+    ## obtain middle points
+    # ratios = [0.25,0.5,0.75] # ratios = [0.125,0.25,0.375,0.5,0.625,0.75,0.875]
+    ratios = [0.25,0.5,0.75]
+    # ratios = [0.0625,0.125,0.1875,0.25,0.3125,0.375,0.4475,0.5,0.5625,0.625,0.06875,0.75,0.8125,0.875,0.9375]
+    middle = MBD_utils.findMiddle(corners=four_corners,mask=new_mask,points=ratios)
+
+    ## all points
+    source_points = np.concatenate((four_corners,middle),axis=0) ## all_point = four_corners(topleft,topright,bottom)+top+bottom+left+right
+
+    ## target points
+    h,w = image.shape[:2]
+    padding = 0
+    target_points = [[padding, padding],[w-padding, padding], [padding, h-padding],[w-padding, h-padding]]
+    for ratio in ratios:
+        target_points.append([int((w-2*padding)*ratio)+padding,padding])
+    for ratio in ratios:
+        target_points.append([int((w-2*padding)*ratio)+padding,h-padding])
+    for ratio in ratios:
+        target_points.append([padding,int((h-2*padding)*ratio)+padding])
+    for ratio in ratios:
+        target_points.append([w-padding,int((h-2*padding)*ratio)+padding])
+
+    ## dewarp base on cv2
+    # pts1 = np.float32(source_points)
+    # pts2 = np.float32(target_points)
+    # tps = cv2.createThinPlateSplineShapeTransformer()
+    # matches = []
+    # N = pts1.shape[0]
+    # for i in range(0,N):
+    #     matches.append(cv2.DMatch(i,i,0))
+    # pts1 = pts1.reshape(1,-1,2)
+    # pts2 = pts2.reshape(1,-1,2)
+    # tps.estimateTransformation(pts2,pts1,matches)
+    # dewarped = tps.warpImage(image)
+
+    ## dewarp base on generated grid
+    source_points = source_points.reshape(-1,2)/np.array([image.shape[:2][::-1]]).reshape(1,2)
+    source_points = torch.from_numpy(source_points).float().cuda()
+    source_points = source_points.unsqueeze(0)
+    source_points = (source_points-0.5)*2
+    target_points = np.asarray(target_points).reshape(-1,2)/np.array([image.shape[:2][::-1]]).reshape(1,2)
+    target_points = torch.from_numpy(target_points).float()
+    target_points = (target_points-0.5)*2
+
+    model = MBD_utils.TPSGridGen(target_height=256,target_width=256,target_control_points=target_points)
+    model = model.cuda()
+    grid = model(source_points).view(-1,256,256,2).permute(0,3,1,2)
+    grid = F.interpolate(grid,(h,w),mode='bilinear').permute(0,2,3,1)
+    dewarped = MBD_utils.torch2cvimg(F.grid_sample(MBD_utils.cvimg2torch(image).cuda(),grid))[0]
+    return dewarped,grid[0].cpu().numpy()
+
+def mask_base_cropper(image,mask):
+    '''
+    input:
+        image -> ndarray HxWx3 uint8
+        mask -> ndarray HxW uint8
+    return
+        dewarped -> ndarray HxWx3 uint8
+        grid (optional) -> ndarray HxWx2 -1~1
+    '''
+
+    ## get contours
+    _, contours, hierarchy = cv2.findContours(mask,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE) ## cv2.__version__ == 3.x
+    # contours,hierarchy = cv2.findContours(mask,cv2.RETR_EXTERNAL,method=cv2.CHAIN_APPROX_SIMPLE) ## cv2.__version__ == 4.x
+
+    ## get biggest contours and four corners based on Douglas-Peucker algorithm
+    four_corners, maxArea, contour = MBD_utils.DP_algorithm(contours)
+    four_corners = MBD_utils.reorder(four_corners)
+
+    ## reserve biggest contours and remove other noisy contours
+    new_mask = np.zeros_like(mask)
+    new_mask = cv2.drawContours(new_mask,[contour],-1,255,cv2.FILLED)
+
+    ## minimum-area bounding rectangle
+    rect = cv2.minAreaRect(contour)  # returns (center (x, y), (width, height), rotation angle) of the minimum-area rectangle
+    box = cv2.boxPoints(rect)  # cv2.boxPoints(rect) for OpenCV 3.x; the 4 corner points of the minimum-area rectangle
+    box = np.int0(box)
+    box = box.reshape((4,1,2))
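Not part of the vendored file, but the dewarper's final step hinges on torch.nn.functional.grid_sample with a normalized [-1, 1] sampling grid of shape (N, H, W, 2), which is what the TPS model above produces. A self-contained sketch of that convention using an identity grid and a synthetic image:

import torch
import torch.nn.functional as F

# Synthetic image tensor: batch of 1, 3 channels, 64x48 (NCHW layout).
img = torch.rand(1, 3, 64, 48)

# Identity affine transform -> identity sampling grid in normalized [-1, 1]
# coordinates, shaped (N, H, W, 2), matching the layout mask_base_dewarper
# builds from its TPS grid generator.
theta = torch.tensor([[[1.0, 0.0, 0.0],
                       [0.0, 1.0, 0.0]]])
grid = F.affine_grid(theta, size=img.shape, align_corners=False)

# grid_sample pulls pixels from the locations in `grid`; with the identity
# grid the output reproduces the input (up to interpolation error).
out = F.grid_sample(img, grid, mode='bilinear', align_corners=False)
print(torch.allclose(out, img, atol=1e-5))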