doctra 0.3.3__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. doctra/__init__.py +4 -0
  2. doctra/cli/main.py +170 -9
  3. doctra/cli/utils.py +2 -3
  4. doctra/engines/image_restoration/__init__.py +10 -0
  5. doctra/engines/image_restoration/docres_engine.py +561 -0
  6. doctra/engines/vlm/outlines_types.py +13 -9
  7. doctra/engines/vlm/service.py +4 -2
  8. doctra/exporters/excel_writer.py +89 -0
  9. doctra/parsers/enhanced_pdf_parser.py +374 -0
  10. doctra/parsers/structured_pdf_parser.py +6 -0
  11. doctra/parsers/table_chart_extractor.py +6 -0
  12. doctra/third_party/docres/data/MBD/MBD.py +110 -0
  13. doctra/third_party/docres/data/MBD/MBD_utils.py +291 -0
  14. doctra/third_party/docres/data/MBD/infer.py +151 -0
  15. doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +95 -0
  16. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +13 -0
  17. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +402 -0
  18. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +151 -0
  19. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +170 -0
  20. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +288 -0
  21. doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +59 -0
  22. doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +81 -0
  23. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +12 -0
  24. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +282 -0
  25. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +129 -0
  26. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +88 -0
  27. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +29 -0
  28. doctra/third_party/docres/data/preprocess/crop_merge_image.py +142 -0
  29. doctra/third_party/docres/inference.py +370 -0
  30. doctra/third_party/docres/models/restormer_arch.py +308 -0
  31. doctra/third_party/docres/utils.py +464 -0
  32. doctra/ui/app.py +8 -14
  33. doctra/utils/structured_utils.py +5 -2
  34. doctra/version.py +1 -1
  35. {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/METADATA +1 -1
  36. doctra-0.4.1.dist-info/RECORD +67 -0
  37. doctra-0.3.3.dist-info/RECORD +0 -44
  38. {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/WHEEL +0 -0
  39. {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/licenses/LICENSE +0 -0
  40. {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/top_level.txt +0 -0
doctra/exporters/excel_writer.py
@@ -5,6 +5,7 @@ from typing import Dict, Any, List, Set
  import pandas as pd  # pip install pandas openpyxl
  from openpyxl.styles import PatternFill, Font, Alignment
  from openpyxl.utils import get_column_letter
+ from openpyxl.worksheet.hyperlink import Hyperlink

  _INVALID_SHEET_CHARS = r'[:\\/*?\[\]]'  # Excel-invalid characters
  _MAX_SHEET_LEN = 31
@@ -85,6 +86,61 @@ def _autosize_columns(ws, df: pd.DataFrame) -> None:
          ws.column_dimensions[get_column_letter(i)].width = min(max(10, max_len + 2), 60)


+ def _style_summary_sheet(ws, df: pd.DataFrame, sheet_mapping: dict = None) -> None:
+     """
+     Apply special styling to the summary sheet with text wrapping for descriptions.
+     Add hyperlinks to table titles that link to their corresponding sheets.
+
+     :param ws: OpenPyXL worksheet object to style
+     :param df: Pandas DataFrame containing the summary data
+     :param sheet_mapping: Dictionary mapping table titles to their sheet names
+     :return: None
+     """
+     # Style header row
+     _style_header(ws, ncols=df.shape[1])
+
+     # Apply text wrapping to all data cells
+     wrap_alignment = Alignment(wrap_text=True, vertical="top")
+
+     # Apply wrapping to all data rows (skip header row)
+     for row_idx in range(2, len(df) + 2):  # Start from row 2 (after header)
+         for col_idx in range(1, df.shape[1] + 1):
+             cell = ws.cell(row=row_idx, column=col_idx)
+             cell.alignment = wrap_alignment
+
+             # Add hyperlink to table title column (column A)
+             if col_idx == 1 and sheet_mapping:  # Table Title column
+                 table_title = cell.value
+                 if table_title and table_title in sheet_mapping:
+                     sheet_name = sheet_mapping[table_title]
+
+                     # Create hyperlink to the sheet using proper Excel format;
+                     # escape the sheet name if it contains spaces or special characters
+                     if ' ' in sheet_name or any(char in sheet_name for char in ['[', ']', '*', '?', ':', '\\', '/']):
+                         hyperlink_ref = f"#'{sheet_name}'!A1"
+                     else:
+                         hyperlink_ref = f"#{sheet_name}!A1"
+
+                     # Use the Hyperlink class with proper parameters
+                     cell.hyperlink = Hyperlink(ref=hyperlink_ref, target=hyperlink_ref)
+                     # Style the hyperlink
+                     cell.font = Font(color="0000FF", underline="single")
+
+     # Set specific column widths for the summary sheet
+     # Table Title column - narrower
+     ws.column_dimensions['A'].width = 30
+     # Description column - wider to accommodate wrapped text
+     ws.column_dimensions['B'].width = 60
+     # Page column - narrow for page numbers
+     ws.column_dimensions['C'].width = 10
+     # Type column - narrow for Table/Chart
+     ws.column_dimensions['D'].width = 12
+
+     # Set row heights to accommodate wrapped text
+     for row_idx in range(2, len(df) + 2):
+         ws.row_dimensions[row_idx].height = 60  # Allow for multiple lines
+
+
  def _normalize_data(headers: List[str], rows: List[List]) -> tuple[List[str], List[List]]:
      """
      Normalize headers and rows to ensure consistent dimensions.
@@ -159,6 +215,31 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
      taken: Set[str] = set()

      with pd.ExcelWriter(excel_path, engine="openpyxl", mode="w") as writer:
+         # Create summary sheet data first
+         summary_data = []
+         sheet_mapping = {}  # Map table titles to their sheet names
+
+         for item in valid_items:
+             title = item.get("title") or "Untitled"
+             description = item.get("description") or "No description available"
+             page_number = item.get("page", "Unknown")
+             item_type = item.get("type", "Table")  # Default to "Table" if not specified
+
+             summary_data.append({
+                 "Table Title": title,
+                 "Description": description,
+                 "Page": page_number,
+                 "Type": item_type
+             })
+
+         # Create summary sheet first (but without hyperlinks initially)
+         if summary_data:
+             summary_df = pd.DataFrame(summary_data)
+             summary_df.to_excel(writer, sheet_name="Table Summary", index=False)
+             taken.add("Table Summary")
+
+         # Process individual table sheets to build sheet mapping
          for item in valid_items:
              try:
                  title = item.get("title") or "Untitled"
@@ -166,6 +247,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
                  rows = item.get("rows") or []

                  sheet_name = _safe_sheet_name(title, taken)
+
+                 # Add to sheet mapping for hyperlinks
+                 sheet_mapping[title] = sheet_name

                  # Normalize data to handle mismatched dimensions
                  normalized_headers, normalized_rows = _normalize_data(headers, rows)
@@ -194,4 +278,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
                  print(f"Error processing item '{item.get('title', 'Unknown')}': {e}")
                  continue

+         # Now add hyperlinks to the summary sheet (after all sheets are created)
+         if summary_data and sheet_mapping:
+             summary_ws = writer.sheets["Table Summary"]
+             _style_summary_sheet(summary_ws, summary_df, sheet_mapping)
+
      return excel_path
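
For context on the reference format the new `_style_summary_sheet` helper relies on: Excel addresses in-workbook hyperlink targets as `#SheetName!A1`, and the sheet name must be single-quoted when it contains spaces or other special characters. A minimal standalone sketch of that format with openpyxl (hypothetical sheet and file names, not code from the package):

```python
from openpyxl import Workbook
from openpyxl.styles import Font

wb = Workbook()
summary = wb.active
summary.title = "Table Summary"
wb.create_sheet("Revenue by Region")  # contains a space, so it must be quoted

cell = summary["A2"]
cell.value = "Revenue by Region"
# Internal references use the "#'Sheet Name'!A1" form; openpyxl also accepts
# a plain string here and wraps it in a Hyperlink object internally.
cell.hyperlink = "#'Revenue by Region'!A1"
cell.font = Font(color="0000FF", underline="single")  # conventional link styling

wb.save("summary_links.xlsx")
```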
doctra/parsers/enhanced_pdf_parser.py
@@ -0,0 +1,374 @@
+ """
+ Enhanced PDF Parser with Image Restoration
+
+ This module provides an enhanced PDF parser that combines the structured parsing
+ capabilities with DocRes image restoration for improved document processing.
+ """
+
+ from __future__ import annotations
+ import os
+ import sys
+ import numpy as np
+ from typing import List, Dict, Any, Optional, Union
+ from contextlib import ExitStack
+ from PIL import Image
+ from tqdm import tqdm
+
+ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
+ from doctra.engines.image_restoration import DocResEngine
+ from doctra.utils.pdf_io import render_pdf_to_images
+ from doctra.utils.constants import IMAGE_SUBDIRS, EXCLUDE_LABELS
+ from doctra.utils.file_ops import ensure_output_dirs
+ from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
+ from doctra.parsers.layout_order import reading_order_key
+ from doctra.utils.ocr_utils import ocr_box_text
+ from doctra.exporters.image_saver import save_box_image
+ from doctra.exporters.markdown_writer import write_markdown
+ from doctra.exporters.html_writer import write_html, write_structured_html
+ from doctra.exporters.excel_writer import write_structured_excel
+ from doctra.utils.structured_utils import to_structured_dict
+ from doctra.exporters.markdown_table import render_markdown_table
+
+
+ class EnhancedPDFParser(StructuredPDFParser):
+     """
+     Enhanced PDF Parser with image restoration capabilities.
+
+     Extends StructuredPDFParser with DocRes image restoration to improve
+     document quality before processing. This is particularly useful for:
+     - Scanned documents with shadows or distortion
+     - Low-quality PDFs that need enhancement
+     - Documents with perspective issues
+
+     :param use_image_restoration: Whether to apply DocRes image restoration (default: True)
+     :param restoration_task: DocRes task to use ("dewarping", "deshadowing", "appearance", "deblurring", "binarization", or "end2end"; default: "appearance")
+     :param restoration_device: Device for DocRes processing ("cuda", "cpu", or None for auto-detect; default: None)
+     :param restoration_dpi: DPI for restoration processing (default: 200)
+     :param use_vlm: Whether to use a VLM for structured data extraction (default: False)
+     :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter"; default: "gemini")
+     :param vlm_model: Model name to use (defaults to provider-specific defaults)
+     :param vlm_api_key: API key for the VLM provider (required if use_vlm is True)
+     :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
+     :param dpi: DPI for PDF rendering (default: 200)
+     :param min_score: Minimum confidence score for layout detection (default: 0.0)
+     :param ocr_lang: OCR language code (default: "eng")
+     :param ocr_psm: Tesseract page segmentation mode (default: 4)
+     :param ocr_oem: Tesseract OCR engine mode (default: 3)
+     :param ocr_extra_config: Additional Tesseract configuration (default: "")
+     :param box_separator: Separator between text boxes in output (default: "\n")
+     """
+
+     def __init__(
+         self,
+         *,
+         use_image_restoration: bool = True,
+         restoration_task: str = "appearance",
+         restoration_device: Optional[str] = None,
+         restoration_dpi: int = 200,
+         use_vlm: bool = False,
+         vlm_provider: str = "gemini",
+         vlm_model: str | None = None,
+         vlm_api_key: str | None = None,
+         layout_model_name: str = "PP-DocLayout_plus-L",
+         dpi: int = 200,
+         min_score: float = 0.0,
+         ocr_lang: str = "eng",
+         ocr_psm: int = 4,
+         ocr_oem: int = 3,
+         ocr_extra_config: str = "",
+         box_separator: str = "\n",
+     ):
+         """
+         Initialize the Enhanced PDF Parser with image restoration capabilities.
+         """
+         # Initialize parent class
+         super().__init__(
+             use_vlm=use_vlm,
+             vlm_provider=vlm_provider,
+             vlm_model=vlm_model,
+             vlm_api_key=vlm_api_key,
+             layout_model_name=layout_model_name,
+             dpi=dpi,
+             min_score=min_score,
+             ocr_lang=ocr_lang,
+             ocr_psm=ocr_psm,
+             ocr_oem=ocr_oem,
+             ocr_extra_config=ocr_extra_config,
+             box_separator=box_separator,
+         )
+
+         # Image restoration settings
+         self.use_image_restoration = use_image_restoration
+         self.restoration_task = restoration_task
+         self.restoration_device = restoration_device
+         self.restoration_dpi = restoration_dpi
+
+         # Initialize DocRes engine if needed
+         self.docres_engine = None
+         if self.use_image_restoration:
+             try:
+                 self.docres_engine = DocResEngine(
+                     device=restoration_device,
+                     use_half_precision=True
+                 )
+                 print(f"✅ DocRes engine initialized with task: {restoration_task}")
+             except Exception as e:
+                 print(f"⚠️ DocRes initialization failed: {e}")
+                 print("   Continuing without image restoration...")
+                 self.use_image_restoration = False
+                 self.docres_engine = None
+
+     def parse(self, pdf_path: str, enhanced_output_dir: str = None) -> None:
+         """
+         Parse a PDF document with optional image restoration.
+
+         :param pdf_path: Path to the input PDF file
+         :param enhanced_output_dir: Directory for enhanced images (if None, uses the default)
+         :return: None
+         """
+         pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
+
+         # Set up output directories
+         if enhanced_output_dir is None:
+             out_dir = f"outputs/{pdf_filename}/enhanced_parse"
+         else:
+             out_dir = enhanced_output_dir
+
+         os.makedirs(out_dir, exist_ok=True)
+         ensure_output_dirs(out_dir, IMAGE_SUBDIRS)
+
+         # Process PDF pages with optional restoration
+         if self.use_image_restoration and self.docres_engine:
+             print(f"🔄 Processing PDF with image restoration: {os.path.basename(pdf_path)}")
+             enhanced_pages = self._process_pages_with_restoration(pdf_path, out_dir)
+         else:
+             print(f"🔄 Processing PDF without image restoration: {os.path.basename(pdf_path)}")
+             enhanced_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
+
+         # Run layout detection on the enhanced pages
+         print("🔍 Running layout detection on enhanced pages...")
+         pages = self.layout_engine.predict_pdf(
+             pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
+         )
+
+         # Use enhanced pages for processing
+         pil_pages = enhanced_pages
+
+         # Continue with the standard parsing logic
+         self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename, pdf_path)
+
+     def _process_pages_with_restoration(self, pdf_path: str, out_dir: str) -> List[Image.Image]:
+         """
+         Process PDF pages with DocRes image restoration.
+
+         :param pdf_path: Path to the input PDF file
+         :param out_dir: Output directory for enhanced images
+         :return: List of enhanced PIL images
+         """
+         # Render original pages
+         original_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.restoration_dpi)]
+
+         if not original_pages:
+             print("❌ No pages found in PDF")
+             return []
+
+         # Create progress bar
+         is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
+         if is_notebook:
+             progress_bar = create_notebook_friendly_bar(
+                 total=len(original_pages),
+                 desc=f"🔄 DocRes {self.restoration_task}"
+             )
+         else:
+             progress_bar = create_beautiful_progress_bar(
+                 total=len(original_pages),
+                 desc=f"🔄 DocRes {self.restoration_task}",
+                 leave=True
+             )
+
+         enhanced_pages = []
+         enhanced_dir = os.path.join(out_dir, "enhanced_pages")
+         os.makedirs(enhanced_dir, exist_ok=True)
+
+         try:
+             with progress_bar:
+                 for i, page_img in enumerate(original_pages):
+                     try:
+                         # Convert PIL to numpy array
+                         img_array = np.array(page_img)
+
+                         # Apply DocRes restoration
+                         restored_img, metadata = self.docres_engine.restore_image(
+                             img_array,
+                             task=self.restoration_task
+                         )
+
+                         # Convert back to PIL Image
+                         enhanced_page = Image.fromarray(restored_img)
+                         enhanced_pages.append(enhanced_page)
+
+                         # Save enhanced page for reference
+                         enhanced_path = os.path.join(enhanced_dir, f"page_{i+1:03d}_enhanced.jpg")
+                         enhanced_page.save(enhanced_path, "JPEG", quality=95)
+
+                         progress_bar.set_description(f"✅ Page {i+1}/{len(original_pages)} enhanced")
+                         progress_bar.update(1)
+
+                     except Exception as e:
+                         print(f"   ⚠️ Page {i+1} restoration failed: {e}, using original")
+                         enhanced_pages.append(page_img)
+                         progress_bar.set_description(f"⚠️ Page {i+1} failed, using original")
+                         progress_bar.update(1)
+
+         finally:
+             if hasattr(progress_bar, 'close'):
+                 progress_bar.close()
+
+         print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
+         return enhanced_pages
+
+     def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename, pdf_path):
+         """
+         Run the parsing logic with the enhanced pages.
+         This is extracted from the parent class to allow customization.
+         """
+         fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
+         chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
+         table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
+
+         md_lines: List[str] = ["# Enhanced Document Content\n"]
+         structured_items: List[Dict[str, Any]] = []
+
+         charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
+         tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
+         figures_desc = "Figures (cropped)"
+
+         with ExitStack() as stack:
+             is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
+             if is_notebook:
+                 charts_bar = stack.enter_context(
+                     create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
+                 tables_bar = stack.enter_context(
+                     create_notebook_friendly_bar(total=table_count, desc=tables_desc)) if table_count else None
+                 figures_bar = stack.enter_context(
+                     create_notebook_friendly_bar(total=fig_count, desc=figures_desc)) if fig_count else None
+             else:
+                 charts_bar = stack.enter_context(
+                     create_beautiful_progress_bar(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
+                 tables_bar = stack.enter_context(
+                     create_beautiful_progress_bar(total=table_count, desc=tables_desc, leave=True)) if table_count else None
+                 figures_bar = stack.enter_context(
+                     create_beautiful_progress_bar(total=fig_count, desc=figures_desc, leave=True)) if fig_count else None
+
+             for p in pages:
+                 page_num = p.page_index
+                 page_img: Image.Image = pil_pages[page_num - 1]
+                 md_lines.append(f"\n## Page {page_num}\n")
+
+                 for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
+                     if box.label in EXCLUDE_LABELS:
+                         img_path = save_box_image(page_img, box, out_dir, page_num, i, IMAGE_SUBDIRS)
+                         abs_img_path = os.path.abspath(img_path)
+                         rel = os.path.relpath(abs_img_path, out_dir)
+
+                         if box.label == "figure":
+                             md_lines.append(f"![Figure — page {page_num}]({rel})\n")
+                             if figures_bar: figures_bar.update(1)
+
+                         elif box.label == "chart":
+                             if self.use_vlm and self.vlm:
+                                 wrote_table = False
+                                 try:
+                                     chart = self.vlm.extract_chart(abs_img_path)
+                                     item = to_structured_dict(chart)
+                                     if item:
+                                         # Add page and type information to the structured item
+                                         item["page"] = page_num
+                                         item["type"] = "Chart"
+                                         structured_items.append(item)
+                                         md_lines.append(
+                                             render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                   title=item.get("title"))
+                                         )
+                                         wrote_table = True
+                                 except Exception as e:
+                                     pass
+                                 if not wrote_table:
+                                     md_lines.append(f"![Chart — page {page_num}]({rel})\n")
+                             else:
+                                 md_lines.append(f"![Chart — page {page_num}]({rel})\n")
+                             if charts_bar: charts_bar.update(1)
+
+                         elif box.label == "table":
+                             if self.use_vlm and self.vlm:
+                                 wrote_table = False
+                                 try:
+                                     table = self.vlm.extract_table(abs_img_path)
+                                     item = to_structured_dict(table)
+                                     if item:
+                                         # Add page and type information to the structured item
+                                         item["page"] = page_num
+                                         item["type"] = "Table"
+                                         structured_items.append(item)
+                                         md_lines.append(
+                                             render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                   title=item.get("title"))
+                                         )
+                                         wrote_table = True
+                                 except Exception as e:
+                                     pass
+                                 if not wrote_table:
+                                     md_lines.append(f"![Table — page {page_num}]({rel})\n")
+                             else:
+                                 md_lines.append(f"![Table — page {page_num}]({rel})\n")
+                             if tables_bar: tables_bar.update(1)
+                     else:
+                         text = ocr_box_text(self.ocr_engine, page_img, box)
+                         if text:
+                             md_lines.append(text)
+                             md_lines.append(self.box_separator if self.box_separator else "")
+
+         md_path = write_markdown(md_lines, out_dir)
+         html_path = write_html(md_lines, out_dir)
+
+         excel_path = None
+         html_structured_path = None
+         if self.use_vlm and structured_items:
+             excel_path = os.path.join(out_dir, "tables.xlsx")
+             write_structured_excel(excel_path, structured_items)
+             html_structured_path = os.path.join(out_dir, "tables.html")
+             write_structured_html(html_structured_path, structured_items)
+
+         print(f"✅ Enhanced parsing completed successfully!")
+         print(f"📁 Output directory: {out_dir}")
+
+     def restore_pdf_only(self, pdf_path: str, output_path: str = None, task: str = None) -> str:
+         """
+         Apply DocRes restoration to a PDF without parsing it.
+
+         :param pdf_path: Path to the input PDF file
+         :param output_path: Path for the enhanced PDF (if None, auto-generated)
+         :param task: DocRes restoration task (if None, uses the instance default)
+         :return: Path to the enhanced PDF
+         """
+         if not self.use_image_restoration or not self.docres_engine:
+             raise RuntimeError("Image restoration is not enabled or the DocRes engine is not available")
+
+         task = task or self.restoration_task
+         return self.docres_engine.restore_pdf(pdf_path, output_path, task, self.restoration_dpi)
+
+     def get_restoration_info(self) -> Dict[str, Any]:
+         """
+         Get information about the current restoration configuration.
+
+         :return: Dictionary with restoration settings and status
+         """
+         return {
+             'enabled': self.use_image_restoration,
+             'task': self.restoration_task,
+             'device': self.restoration_device,
+             'dpi': self.restoration_dpi,
+             'engine_available': self.docres_engine is not None,
+             'supported_tasks': self.docres_engine.get_supported_tasks() if self.docres_engine else []
+         }
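
Taken together, the new class is a drop-in superset of StructuredPDFParser. A minimal usage sketch based on the constructor and method signatures above (the sample PDF path is hypothetical):

```python
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser

# Restoration-only defaults: DocRes "appearance" cleanup, no VLM extraction.
parser = EnhancedPDFParser(
    use_image_restoration=True,
    restoration_task="appearance",
    restoration_device=None,  # None lets the engine auto-detect CUDA vs. CPU
)

# Writes markdown/HTML (and tables.xlsx/tables.html when use_vlm=True)
# under outputs/<pdf name>/enhanced_parse/ by default.
parser.parse("scanned_report.pdf")

# Or skip parsing entirely and just produce a restored PDF.
enhanced_pdf = parser.restore_pdf_only("scanned_report.pdf", task="deshadowing")
print(parser.get_restoration_info())
```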
doctra/parsers/structured_pdf_parser.py
@@ -163,6 +163,9 @@ class StructuredPDFParser:
                                      chart = self.vlm.extract_chart(abs_img_path)
                                      item = to_structured_dict(chart)
                                      if item:
+                                         # Add page and type information to the structured item
+                                         item["page"] = page_num
+                                         item["type"] = "Chart"
                                          structured_items.append(item)
                                          md_lines.append(
                                              render_markdown_table(item.get("headers"), item.get("rows"),
@@ -184,6 +187,9 @@ class StructuredPDFParser:
                                      table = self.vlm.extract_table(abs_img_path)
                                      item = to_structured_dict(table)
                                      if item:
+                                         # Add page and type information to the structured item
+                                         item["page"] = page_num
+                                         item["type"] = "Table"
                                          structured_items.append(item)
                                          md_lines.append(
                                              render_markdown_table(item.get("headers"), item.get("rows"),
doctra/parsers/table_chart_extractor.py
@@ -178,6 +178,9 @@ class ChartTablePDFParser:
                      extracted_chart = self.vlm.extract_chart(chart_path)
                      structured_item = to_structured_dict(extracted_chart)
                      if structured_item:
+                         # Add page and type information to the structured item
+                         structured_item["page"] = page_num
+                         structured_item["type"] = "Chart"
                          structured_items.append(structured_item)
                          vlm_items.append({
                              "kind": "chart",
@@ -221,6 +224,9 @@ class ChartTablePDFParser:
                      extracted_table = self.vlm.extract_table(table_path)
                      structured_item = to_structured_dict(extracted_table)
                      if structured_item:
+                         # Add page and type information to the structured item
+                         structured_item["page"] = page_num
+                         structured_item["type"] = "Table"
                          structured_items.append(structured_item)
                          vlm_items.append({
                              "kind": "table",
doctra/third_party/docres/data/MBD/MBD.py
@@ -0,0 +1,110 @@
+ import cv2
+ import numpy as np
+ import MBD_utils
+ import torch
+ import torch.nn.functional as F
+
+
+ def mask_base_dewarper(image, mask):
+     '''
+     input:
+         image -> ndarray HxWx3 uint8
+         mask  -> ndarray HxW uint8
+     return:
+         dewarped -> ndarray HxWx3 uint8
+         grid (optional) -> ndarray HxWx2, values in -1~1
+     '''
+
+     ## get contours
+     # _, contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)  ## cv2.__version__ == 3.x
+     contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_SIMPLE)  ## cv2.__version__ == 4.x
+
+     ## get the biggest contour and its four corners based on the Douglas-Peucker algorithm
+     four_corners, maxArea, contour = MBD_utils.DP_algorithm(contours)
+     four_corners = MBD_utils.reorder(four_corners)
+
+     ## keep the biggest contour and remove other noisy contours
+     new_mask = np.zeros_like(mask)
+     new_mask = cv2.drawContours(new_mask, [contour], -1, 255, cv2.FILLED)
+
+     ## obtain middle points
+     # ratios = [0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875]
+     ratios = [0.25, 0.5, 0.75]
+     # ratios = [0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4475, 0.5, 0.5625, 0.625, 0.06875, 0.75, 0.8125, 0.875, 0.9375]
+     middle = MBD_utils.findMiddle(corners=four_corners, mask=new_mask, points=ratios)
+
+     ## all points = four corners (top-left, top-right, bottom) + top/bottom/left/right middle points
+     source_points = np.concatenate((four_corners, middle), axis=0)
+
+     ## target points
+     h, w = image.shape[:2]
+     padding = 0
+     target_points = [[padding, padding], [w - padding, padding], [padding, h - padding], [w - padding, h - padding]]
+     for ratio in ratios:
+         target_points.append([int((w - 2 * padding) * ratio) + padding, padding])
+     for ratio in ratios:
+         target_points.append([int((w - 2 * padding) * ratio) + padding, h - padding])
+     for ratio in ratios:
+         target_points.append([padding, int((h - 2 * padding) * ratio) + padding])
+     for ratio in ratios:
+         target_points.append([w - padding, int((h - 2 * padding) * ratio) + padding])
+
+     ## dewarp based on cv2 (thin-plate-spline alternative, unused)
+     # pts1 = np.float32(source_points)
+     # pts2 = np.float32(target_points)
+     # tps = cv2.createThinPlateSplineShapeTransformer()
+     # matches = []
+     # N = pts1.shape[0]
+     # for i in range(0, N):
+     #     matches.append(cv2.DMatch(i, i, 0))
+     # pts1 = pts1.reshape(1, -1, 2)
+     # pts2 = pts2.reshape(1, -1, 2)
+     # tps.estimateTransformation(pts2, pts1, matches)
+     # dewarped = tps.warpImage(image)
+
+     ## dewarp based on a generated grid
+     source_points = source_points.reshape(-1, 2) / np.array([image.shape[:2][::-1]]).reshape(1, 2)
+     source_points = torch.from_numpy(source_points).float().cuda()
+     source_points = source_points.unsqueeze(0)
+     source_points = (source_points - 0.5) * 2
+     target_points = np.asarray(target_points).reshape(-1, 2) / np.array([image.shape[:2][::-1]]).reshape(1, 2)
+     target_points = torch.from_numpy(target_points).float()
+     target_points = (target_points - 0.5) * 2
+
+     model = MBD_utils.TPSGridGen(target_height=256, target_width=256, target_control_points=target_points)
+     model = model.cuda()
+     grid = model(source_points).view(-1, 256, 256, 2).permute(0, 3, 1, 2)
+     grid = F.interpolate(grid, (h, w), mode='bilinear').permute(0, 2, 3, 1)
+     dewarped = MBD_utils.torch2cvimg(F.grid_sample(MBD_utils.cvimg2torch(image).cuda(), grid))[0]
+     return dewarped, grid[0].cpu().numpy()
+
+
+ def mask_base_cropper(image, mask):
+     '''
+     input:
+         image -> ndarray HxWx3 uint8
+         mask  -> ndarray HxW uint8
+     return:
+         dewarped -> ndarray HxWx3 uint8
+         grid (optional) -> ndarray HxWx2, values in -1~1
+     '''
+
+     ## get contours
+     _, contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)  ## cv2.__version__ == 3.x
+     # contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_SIMPLE)  ## cv2.__version__ == 4.x
+
+     ## get the biggest contour and its four corners based on the Douglas-Peucker algorithm
+     four_corners, maxArea, contour = MBD_utils.DP_algorithm(contours)
+     four_corners = MBD_utils.reorder(four_corners)
+
+     ## keep the biggest contour and remove other noisy contours
+     new_mask = np.zeros_like(mask)
+     new_mask = cv2.drawContours(new_mask, [contour], -1, 255, cv2.FILLED)
+
+     ## minimum-area bounding rectangle
+     rect = cv2.minAreaRect(contour)  # returns (center (x, y), (width, height), rotation angle)
+     box = cv2.boxPoints(rect)  # cv2.boxPoints(rect) for OpenCV 3.x; the 4 corner points of the rectangle
+     box = np.int0(box)
+     box = box.reshape((4, 1, 2))
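
One portability note on the hunk above: `cv2.findContours` returns `(image, contours, hierarchy)` in OpenCV 3.x but `(contours, hierarchy)` in 4.x, which is why both functions carry both call forms with one commented out. A small version-agnostic wrapper (a sketch, not part of the package) avoids pinning either variant:

```python
import cv2

def find_contours_compat(mask):
    """Return (contours, hierarchy) on both OpenCV 3.x and 4.x."""
    result = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # OpenCV 3.x returns 3 values, 4.x returns 2; the last two are always
    # (contours, hierarchy), so index from the end.
    return result[-2], result[-1]
```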