doctra 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. doctra/__init__.py +19 -0
  2. doctra/cli/__init__.py +27 -0
  3. doctra/cli/main.py +856 -0
  4. doctra/cli/utils.py +340 -0
  5. doctra/engines/__init__.py +0 -0
  6. doctra/engines/layout/__init__.py +0 -0
  7. doctra/engines/layout/layout_models.py +90 -0
  8. doctra/engines/layout/paddle_layout.py +225 -0
  9. doctra/engines/ocr/__init__.py +4 -0
  10. doctra/engines/ocr/api.py +36 -0
  11. doctra/engines/ocr/path_resolver.py +48 -0
  12. doctra/engines/ocr/pytesseract_engine.py +76 -0
  13. doctra/engines/vlm/__init__.py +0 -0
  14. doctra/engines/vlm/outlines_types.py +31 -0
  15. doctra/engines/vlm/provider.py +58 -0
  16. doctra/engines/vlm/service.py +117 -0
  17. doctra/exporters/__init__.py +0 -0
  18. doctra/exporters/excel_writer.py +197 -0
  19. doctra/exporters/image_saver.py +42 -0
  20. doctra/exporters/markdown_table.py +56 -0
  21. doctra/exporters/markdown_writer.py +29 -0
  22. doctra/parsers/__init__.py +6 -0
  23. doctra/parsers/layout_order.py +16 -0
  24. doctra/parsers/structured_pdf_parser.py +434 -0
  25. doctra/parsers/table_chart_extractor.py +283 -0
  26. doctra/utils/__init__.py +0 -0
  27. doctra/utils/bbox.py +18 -0
  28. doctra/utils/constants.py +8 -0
  29. doctra/utils/file_ops.py +26 -0
  30. doctra/utils/io_utils.py +10 -0
  31. doctra/utils/ocr_utils.py +20 -0
  32. doctra/utils/pdf_io.py +19 -0
  33. doctra/utils/quiet.py +13 -0
  34. doctra/utils/structured_utils.py +49 -0
  35. doctra/version.py +2 -0
  36. doctra-0.1.0.dist-info/METADATA +626 -0
  37. doctra-0.1.0.dist-info/RECORD +40 -0
  38. doctra-0.1.0.dist-info/WHEEL +5 -0
  39. doctra-0.1.0.dist-info/licenses/LICENSE +201 -0
  40. doctra-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,16 @@
1
+ from __future__ import annotations
2
+ from typing import Tuple
3
+ from doctra.engines.layout.layout_models import LayoutBox
4
+
5
def reading_order_key(b: LayoutBox) -> Tuple[float, float]:
    """
    Build the sort key that puts layout boxes into reading order.

    Boxes are ordered by their ``y1`` coordinate first and by their ``x1``
    coordinate second, so sorting with this key walks the page
    top-to-bottom and, for boxes sharing the same ``y1``, left-to-right.

    :param b: LayoutBox object to generate a sorting key for
    :return: Tuple of (y1, x1) coordinates for sorting in reading order
    """
    top_edge, left_edge = b.y1, b.x1
    return top_edge, left_edge
@@ -0,0 +1,434 @@
1
+ from __future__ import annotations
2
+ import os
3
+ import re
4
+ from typing import List, Dict, Any
5
+ from contextlib import ExitStack
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ from tqdm import tqdm
8
+ from doctra.utils.pdf_io import render_pdf_to_images
9
+ from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
10
+ from doctra.engines.layout.layout_models import LayoutPage
11
+ from doctra.engines.ocr import PytesseractOCREngine
12
+ from doctra.utils.constants import EXCLUDE_LABELS, IMAGE_SUBDIRS
13
+ from doctra.parsers.layout_order import reading_order_key
14
+ from doctra.utils.ocr_utils import ocr_box_text
15
+ from doctra.exporters.image_saver import save_box_image
16
+ from doctra.utils.file_ops import ensure_output_dirs
17
+ from doctra.engines.vlm.service import VLMStructuredExtractor
18
+ from doctra.exporters.excel_writer import write_structured_excel
19
+ from doctra.utils.structured_utils import to_structured_dict
20
+ from doctra.exporters.markdown_table import render_markdown_table
21
+ from doctra.exporters.markdown_writer import write_markdown
22
+
23
+
24
class StructuredPDFParser:
    """
    Comprehensive PDF parser for extracting all types of content.

    Runs layout detection on a PDF, OCRs the textual boxes, saves cropped
    images for boxes whose label is in ``EXCLUDE_LABELS`` and, when VLM
    processing is enabled, tries to convert chart/table crops into
    structured tables. ``parse`` writes its results under
    ``outputs/<pdf file name without extension>``.

    :param use_vlm: Whether to use VLM for structured data extraction (default: False)
    :param vlm_provider: VLM provider to use ("gemini" or "openai", default: "gemini")
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key forwarded to the VLM extractor when use_vlm is True
    :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
    :param dpi: DPI for PDF rendering (default: 200)
    :param min_score: Minimum confidence score for layout detection (default: 0.0)
    :param ocr_lang: OCR language code (default: "eng")
    :param ocr_psm: Tesseract page segmentation mode (default: 4)
    :param ocr_oem: Tesseract OCR engine mode (default: 3)
    :param ocr_extra_config: Additional Tesseract configuration (default: "")
    :param box_separator: Separator between text boxes in output (default: "\\n")
    """

    def __init__(
            self,
            *,
            use_vlm: bool = False,
            vlm_provider: str = "gemini",
            vlm_model: str | None = None,
            vlm_api_key: str | None = None,
            layout_model_name: str = "PP-DocLayout_plus-L",
            dpi: int = 200,
            min_score: float = 0.0,
            ocr_lang: str = "eng",
            ocr_psm: int = 4,
            ocr_oem: int = 3,
            ocr_extra_config: str = "",
            box_separator: str = "\n",
    ):
        """
        Initialize the StructuredPDFParser with processing configuration.

        Builds the layout detection engine and the OCR engine, and creates
        the VLM extractor only when ``use_vlm`` is True (``self.vlm`` stays
        None otherwise).

        :param use_vlm: Whether to use VLM for structured data extraction
        :param vlm_provider: VLM provider to use ("gemini" or "openai")
        :param vlm_model: Model name to use (defaults to provider-specific defaults)
        :param vlm_api_key: API key for VLM provider
        :param layout_model_name: Layout detection model name
        :param dpi: DPI for PDF rendering
        :param min_score: Minimum confidence score for layout detection
        :param ocr_lang: OCR language code
        :param ocr_psm: Tesseract page segmentation mode
        :param ocr_oem: Tesseract OCR engine mode
        :param ocr_extra_config: Additional Tesseract configuration
        :param box_separator: Separator between text boxes in output
        """
        self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
        self.dpi = dpi
        self.min_score = min_score
        self.ocr_engine = PytesseractOCREngine(
            lang=ocr_lang, psm=ocr_psm, oem=ocr_oem, extra_config=ocr_extra_config
        )
        self.box_separator = box_separator
        self.use_vlm = use_vlm
        self.vlm = None
        if self.use_vlm:
            self.vlm = VLMStructuredExtractor(
                vlm_provider=vlm_provider,
                vlm_model=vlm_model,
                api_key=vlm_api_key,
            )

    def parse(self, pdf_path: str) -> None:
        """
        Parse a PDF document and extract all content types.

        Processes the PDF through layout detection, extracts text using OCR,
        saves images for visual elements, and optionally converts charts/tables
        to structured data using VLM. Writes a Markdown file (and
        ``tables.xlsx`` when VLM produced structured items) into
        ``outputs/<pdf name>`` and prints the resulting paths.

        :param pdf_path: Path to the input PDF file
        :return: None
        """
        # Extract filename without extension and create output directory
        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
        out_dir = f"outputs/{pdf_filename}"

        os.makedirs(out_dir, exist_ok=True)
        ensure_output_dirs(out_dir, IMAGE_SUBDIRS)

        pages: List[LayoutPage] = self.layout_engine.predict_pdf(
            pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
        )
        pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]

        # Count boxes per visual label in a single pass (progress bar totals).
        # Insertion order fixes the order in which the bars are created.
        totals: Dict[str, int] = {"chart": 0, "table": 0, "figure": 0}
        for p in pages:
            for b in p.boxes:
                if b.label in totals:
                    totals[b.label] += 1

        descriptions = {
            "chart": "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)",
            "table": "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)",
            "figure": "Figures (cropped)",
        }

        md_lines: List[str] = ["# Extracted Content\n"]
        structured_items: List[Dict[str, Any]] = []

        with ExitStack() as stack:
            # A bar is only created for labels that actually occur.
            bars = {
                label: (
                    stack.enter_context(tqdm(total=total, desc=descriptions[label], leave=True))
                    if total else None
                )
                for label, total in totals.items()
            }

            for p in pages:
                page_num = p.page_index
                # page_index is used 1-based here to address the rendered page list
                page_img: Image.Image = pil_pages[page_num - 1]
                md_lines.append(f"\n## Page {page_num}\n")

                for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
                    if box.label not in EXCLUDE_LABELS:
                        # Textual box: OCR it and emit the text followed by the separator
                        text = ocr_box_text(self.ocr_engine, page_img, box)
                        if text:
                            md_lines.append(text)
                            md_lines.append(self.box_separator if self.box_separator else "")
                        continue

                    # Visual box: always save the crop; the Markdown links are
                    # relative to the output directory.
                    img_path = save_box_image(page_img, box, out_dir, page_num, i, IMAGE_SUBDIRS)
                    abs_img_path = os.path.abspath(img_path)
                    rel = os.path.relpath(abs_img_path, out_dir)

                    if box.label == "figure":
                        # Figures are always images in MD
                        md_lines.append(f"![Figure — page {page_num}]({rel})\n")
                    elif box.label in ("chart", "table"):
                        self._append_structured_or_image(
                            box.label, abs_img_path, rel, page_num, md_lines, structured_items
                        )
                    else:
                        # Other excluded labels only get their crop saved
                        continue

                    bar = bars[box.label]
                    if bar is not None:
                        bar.update(1)

        md_path = write_markdown(md_lines, out_dir)
        excel_path = None
        if self.use_vlm and structured_items:
            excel_path = os.path.join(out_dir, "tables.xlsx")
            write_structured_excel(excel_path, structured_items)

        if excel_path:
            print(f"Parsing completed successfully.\n- Markdown: {md_path}\n- Excel: {excel_path}")
        else:
            print(f"Parsing completed successfully.\n- Markdown: {md_path}")

    def _append_structured_or_image(self, label: str, abs_img_path: str, rel: str, page_num: int,
                                    md_lines: List[str], structured_items: List[Dict[str, Any]]) -> None:
        """
        Append the Markdown for one chart/table crop.

        With VLM enabled, tries to extract structured data and render it as a
        Markdown table; on any failure (or an empty result) falls back to an
        image link to the saved crop.

        :param label: Box label, either "chart" or "table"
        :param abs_img_path: Absolute path of the saved crop passed to the VLM
        :param rel: Crop path relative to the output directory (used in the link)
        :param page_num: Page number shown in the image alt text
        :param md_lines: Markdown line buffer, appended to in place
        :param structured_items: Structured results buffer, appended to in place
        :return: None
        """
        if self.use_vlm and self.vlm:
            try:
                extract = self.vlm.extract_chart if label == "chart" else self.vlm.extract_table
                item = to_structured_dict(extract(abs_img_path))
                if item:
                    structured_items.append(item)
                    md_lines.append(
                        render_markdown_table(item.get("headers"), item.get("rows"),
                                              title=item.get("title"))
                    )
                    return
            except Exception as exc:
                # Best effort: a failed extraction must not abort the document,
                # but it is reported instead of being silently dropped.
                # tqdm.write keeps the message from corrupting active bars.
                tqdm.write(
                    f"VLM extraction failed for {label} on page {page_num}: {exc!r}; "
                    f"falling back to image."
                )
        md_lines.append(f"![{label.title()} — page {page_num}]({rel})\n")

    @staticmethod
    def _load_fonts():
        """
        Load the (title, label) fonts used by the layout visualization.

        :return: Tuple ``(font, small_font)``; Arial when available, otherwise
                 Pillow's default font, otherwise ``(None, None)``
        """
        try:
            return ImageFont.truetype("arial.ttf", 24), ImageFont.truetype("arial.ttf", 18)
        except (OSError, ImportError):
            # Font file not found/readable, or FreeType support unavailable
            pass
        try:
            return ImageFont.load_default(), ImageFont.load_default()
        except Exception:
            # Last resort: let Pillow pick whatever it uses for font=None
            return None, None

    @staticmethod
    def _measure_text(draw, text: str, font, char_width: int, default_height: int):
        """
        Measure rendered text, falling back to a per-character estimate.

        :param draw: ImageDraw context used for measuring
        :param text: Text to measure
        :param font: Font to measure with (may be None)
        :param char_width: Estimated pixel width per character for the fallback
        :param default_height: Estimated pixel height for the fallback
        :return: Tuple ``(width, height)`` in pixels
        """
        if font:
            try:
                left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
                return right - left, bottom - top
            except ValueError:
                # NOTE(review): some Pillow releases appear to support textbbox
                # only for TrueType fonts; use the estimate in that case.
                pass
        return len(text) * char_width, default_height

    def display_pages_with_boxes(self, pdf_path: str, num_pages: int = 3, cols: int = 2,
                                 page_width: int = 800, spacing: int = 40,
                                 save_path: str | None = None) -> Image.Image | None:
        """
        Display the first N pages of a PDF with bounding boxes and labels overlaid in a grid layout.

        Creates a visualization showing layout detection results with bounding boxes,
        labels, and confidence scores overlaid on the PDF pages in a grid format,
        with a color legend to the right of the grid. Also prints a per-label
        summary of the detected elements.

        :param pdf_path: Path to the input PDF file
        :param num_pages: Number of pages to display (default: 3)
        :param cols: Number of columns in the grid layout (default: 2)
        :param page_width: Width to resize each page to in pixels (default: 800)
        :param spacing: Spacing between pages in pixels (default: 40)
        :param save_path: Optional path to save the visualization (if None, opens a viewer instead)
        :return: The composed image, or None when there are no pages to display
        :raises ValueError: If ``cols`` or ``page_width`` is smaller than 1
        """
        if cols < 1:
            raise ValueError(f"cols must be >= 1, got {cols}")
        if page_width < 1:
            raise ValueError(f"page_width must be >= 1, got {page_width}")

        # Get layout predictions
        pages: List[LayoutPage] = self.layout_engine.predict_pdf(
            pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
        )

        # Limit to requested number of pages (a negative request shows nothing)
        pages_to_show = max(0, min(num_pages, len(pages)))

        if pages_to_show == 0:
            print("No pages to display")
            return None

        # Only render the PDF once we know there is something to draw
        pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
        shown_pages = pages[:pages_to_show]

        # Calculate grid dimensions
        rows = (pages_to_show + cols - 1) // cols

        # Collect unique labels from the processed pages and assign colors
        used_labels = {box.label.lower() for page in shown_pages for box in page.boxes}
        ordered_labels = sorted(used_labels)

        # Colors are assigned by sorted label order, cycling through the palette
        base_colors = ['#3B82F6', '#EF4444', '#10B981', '#F59E0B', '#8B5CF6',
                       '#F97316', '#EC4899', '#6B7280', '#84CC16', '#06B6D4',
                       '#DC2626', '#059669', '#7C3AED', '#DB2777', '#0891B2']
        dynamic_label_colors = {
            label: base_colors[i % len(base_colors)]
            for i, label in enumerate(ordered_labels)
        }

        # Fonts do not depend on the page, so load them once
        font, small_font = self._load_fonts()

        # Process each page and add bounding boxes
        processed_pages = []

        for idx, page in enumerate(shown_pages):
            page_img = pil_pages[idx].copy()

            # Calculate scale factor to resize to target width
            scale_factor = page_width / page_img.width
            new_height = int(page_img.height * scale_factor)
            page_img = page_img.resize((page_width, new_height), Image.LANCZOS)

            # Create drawing context
            draw = ImageDraw.Draw(page_img)

            # Draw bounding boxes
            for box in page.boxes:
                # Scale coordinates
                x1 = int(box.x1 * scale_factor)
                y1 = int(box.y1 * scale_factor)
                x2 = int(box.x2 * scale_factor)
                y2 = int(box.y2 * scale_factor)

                # Get color for this label from dynamic assignment
                color = dynamic_label_colors.get(box.label.lower(), '#000000')

                # Draw the box outline
                draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

                # Measure the label text
                label_text = f"{box.label} ({box.score:.2f})"
                text_width, text_height = self._measure_text(draw, label_text, small_font, 8, 15)

                # Position label above the box, clamped to the top of the page
                label_x = x1
                label_y = max(0, y1 - text_height - 8)

                # Draw label background with padding
                padding = 4
                draw.rectangle([
                    label_x - padding,
                    label_y - padding,
                    label_x + text_width + padding,
                    label_y + text_height + padding
                ], fill='white', outline=color, width=2)

                # Draw label text
                draw.text((label_x, label_y), label_text, fill=color, font=small_font)

            # Add page title
            title_text = f"Page {page.page_index} ({len(page.boxes)} boxes)"
            title_width, _ = self._measure_text(draw, title_text, font, 12, 20)

            # Draw title background, horizontally centered
            title_x = (page_width - title_width) // 2
            title_y = 10
            draw.rectangle([title_x - 10, title_y - 5, title_x + title_width + 10, title_y + 35],
                           fill='white', outline='#1F2937', width=2)
            draw.text((title_x, title_y), title_text, fill='#1F2937', font=font)

            processed_pages.append(page_img)

        # Each grid row is as tall as its tallest page, so pages of different
        # sizes neither overlap nor get clipped.
        row_heights = [
            max(img.height for img in processed_pages[r * cols:(r + 1) * cols])
            for r in range(rows)
        ]
        row_tops = []
        next_top = 0
        for height in row_heights:
            row_tops.append(next_top)
            next_top += height + spacing

        # Create grid layout with space for legend
        legend_width = 250
        grid_width = cols * page_width + (cols - 1) * spacing
        total_width = grid_width + legend_width + spacing
        grid_height = sum(row_heights) + (rows - 1) * spacing

        # Legend geometry is computed before the canvas so the canvas can be
        # made tall enough to hold the whole legend.
        legend_x = grid_width + spacing
        legend_y = 20
        legend_title = "Element Types"
        scratch = ImageDraw.Draw(Image.new('RGB', (1, 1)))
        _, title_height = self._measure_text(scratch, legend_title, font, 12, 20)
        legend_bg_height = len(used_labels) * 35 + title_height + 40
        canvas_height = max(grid_height, legend_y + legend_bg_height + 10)

        # Create final grid image with modern background
        final_img = Image.new('RGB', (total_width, canvas_height), '#F8FAFC')

        # Place pages in grid
        for idx, page_img in enumerate(processed_pages):
            row, col = divmod(idx, cols)
            x_pos = col * (page_width + spacing)
            final_img.paste(page_img, (x_pos, row_tops[row]))

        draw_legend = ImageDraw.Draw(final_img)

        # Draw legend background
        draw_legend.rectangle([legend_x - 10, legend_y - 10,
                               legend_x + legend_width - 10, legend_y + legend_bg_height],
                              fill='white', outline='#E5E7EB', width=2)

        # Draw legend title
        draw_legend.text((legend_x + 10, legend_y + 5), legend_title,
                         fill='#1F2937', font=font)

        # Draw legend items using the colors of the labels actually detected
        current_y = legend_y + title_height + 20
        square_size = 20

        for label in ordered_labels:
            color = dynamic_label_colors[label]

            # Draw color square
            draw_legend.rectangle([legend_x + 10, current_y,
                                   legend_x + 10 + square_size, current_y + square_size],
                                  fill=color, outline='#6B7280', width=1)

            # Draw label text
            draw_legend.text((legend_x + 40, current_y + 2), label.title(),
                             fill='#374151', font=small_font)

            current_y += 30

        # Save or display
        if save_path:
            final_img.save(save_path, quality=95, optimize=True)
            print(f"Layout visualization saved to: {save_path}")
        else:
            # Display using PIL's default viewer
            final_img.show()

        # Print summary statistics
        print(f"\n📊 Layout Detection Summary for {os.path.basename(pdf_path)}:")
        print(f"Pages processed: {pages_to_show}")

        # Create summary by label across all displayed pages
        total_counts: Dict[str, int] = {}
        for page in shown_pages:
            for box in page.boxes:
                total_counts[box.label] = total_counts.get(box.label, 0) + 1

        print("\nTotal elements detected:")
        for label, count in sorted(total_counts.items()):
            print(f"  - {label}: {count}")

        return final_img