natural-pdf 25.3.16 (natural_pdf-25.3.16-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
natural_pdf/analyzers/document_layout.py
@@ -0,0 +1,736 @@
+"""
+Document layout analysis for natural-pdf.
+
+This module provides functionality for detecting and analyzing the layout
+of PDF documents using machine learning models.
+"""
+import os
+import tempfile
+import importlib.util
+import logging
+from typing import Dict, List, Optional, Tuple, Union, Any, Set
+import numpy as np
+import torch
+from PIL import Image
+
+from huggingface_hub import hf_hub_download
+from doclayout_yolo import YOLOv10
+from torchvision import transforms
+from transformers import AutoModelForObjectDetection
+
+from natural_pdf.elements.region import Region
+
+# Set up module logger
+logger = logging.getLogger("natural_pdf.analyzers.layout")
+
+
+class LayoutDetector:
+    """
+    Base class for document layout detection.
+    """
+    def __init__(self):
+        self.supported_classes: Set[str] = set()
+
+    def detect(self, image_path: str, confidence: float = 0.5,
+               classes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        """
+        Detect layout elements in an image.
+
+        Args:
+            image_path: Path to the image to analyze
+            confidence: Minimum confidence threshold for detections
+            classes: Specific classes to detect, or None for all supported classes
+
+        Returns:
+            List of detected regions with their properties
+        """
+        raise NotImplementedError("Subclasses must implement this method")
+
+    def _normalize_class_name(self, name: str) -> str:
+        """Convert class names with spaces to hyphenated format for selectors."""
+        return name.lower().replace(' ', '-')
+
+    def validate_classes(self, classes: List[str]) -> None:
+        """
+        Validate that the requested classes are supported by this detector.
+
+        Args:
+            classes: List of class names to validate
+
+        Raises:
+            ValueError: If any class is not supported
+        """
+        if classes:
+            normalized_supported = {self._normalize_class_name(c) for c in self.supported_classes}
+            unsupported = [c for c in classes if self._normalize_class_name(c) not in normalized_supported]
+            if unsupported:
+                raise ValueError(f"Classes not supported by this detector: {unsupported}. "
+                                 f"Supported classes: {sorted(self.supported_classes)}")
+
+
+class YOLODocLayoutDetector(LayoutDetector):
+    """
+    Document layout detector using YOLO model.
+    """
+    def __init__(self,
+                 model_repo: str = "juliozhao/DocLayout-YOLO-DocStructBench",
+                 model_file: str = "doclayout_yolo_docstructbench_imgsz1024.pt",
+                 device: str = "cpu"):
+        """
+        Initialize the YOLO document layout detector.
+
+        Args:
+            model_repo: Hugging Face repository ID for the model
+            model_file: Filename of the model in the repository
+            device: Device to use for inference ('cpu' or 'cuda:0', etc.)
+        """
+        super().__init__()
+        self.model_repo = model_repo
+        self.model_file = model_file
+        self.device = device
+        self._model = None
+        self._model_path = None
+
+        # DocLayout YOLO classes
+        self.supported_classes = {
+            'title', 'plain text', 'abandon', 'figure', 'figure_caption',
+            'table', 'table_caption', 'table_footnote', 'isolate_formula',
+            'formula_caption'
+        }
+
+    @property
+    def model(self) -> YOLOv10:
+        """Lazy-load the model when first needed."""
+        if self._model is None:
+            self._model_path = hf_hub_download(repo_id=self.model_repo, filename=self.model_file)
+            self._model = YOLOv10(self._model_path)
+        return self._model
+
+    def detect(self, image_path: str, confidence: float = 0.2,
+               classes: Optional[List[str]] = None,
+               exclude_classes: Optional[List[str]] = None,
+               image_size: int = 1024) -> List[Dict[str, Any]]:
+        """
+        Detect layout elements in an image using YOLO.
+
+        Args:
+            image_path: Path to the image to analyze
+            confidence: Minimum confidence threshold for detections
+            classes: Specific classes to detect, or None for all supported classes
+            exclude_classes: Classes to exclude from detection
+            image_size: Size to resize the image to before detection
+
+        Returns:
+            List of detected regions with their properties
+        """
+        # Validate requested classes
+        self.validate_classes(classes or [])
+
+        # Validate excluded classes
+        if exclude_classes:
+            self.validate_classes(exclude_classes)
+
+        # Run model prediction
+        results = self.model.predict(
+            image_path,
+            imgsz=image_size,
+            conf=confidence,
+            device=self.device
+        )
+
+        # Process results into standardized format
+        detections = []
+        for result in results:
+            boxes = result.boxes.xyxy  # [x_min, y_min, x_max, y_max]
+            labels = result.boxes.cls
+            scores = result.boxes.conf
+            class_names = result.names
+
+            for box, label, score in zip(boxes, labels, scores):
+                x_min, y_min, x_max, y_max = box.tolist()
+                label_idx = int(label)
+                label_name = class_names[label_idx]
+
+                # Skip if specific classes requested and this isn't one of them
+                if classes and label_name not in classes:
+                    continue
+
+                # Skip if this class is in the excluded classes
+                if exclude_classes and label_name in exclude_classes:
+                    continue
+
+                detections.append({
+                    'bbox': (x_min, y_min, x_max, y_max),
+                    'class': label_name,
+                    'confidence': float(score),
+                    'normalized_class': self._normalize_class_name(label_name)
+                })
+
+        return detections
+
+
+class TableTransformerDetector(LayoutDetector):
+    """
+    Table structure detector using Microsoft's Table Transformer (TATR) models.
+    """
+
+    # Custom resize transform
+    class MaxResize(object):
+        def __init__(self, max_size=800):
+            self.max_size = max_size
+
+        def __call__(self, image):
+            width, height = image.size
+            current_max_size = max(width, height)
+            scale = self.max_size / current_max_size
+            resized_image = image.resize((int(round(scale*width)), int(round(scale*height))))
+            return resized_image
+
+    def __init__(self,
+                 detection_model: str = "microsoft/table-transformer-detection",
+                 structure_model: str = "microsoft/table-transformer-structure-recognition-v1.1-all",
+                 max_detection_size: int = 800,
+                 max_structure_size: int = 1000,
+                 device: str = None):
+        """
+        Initialize the Table Transformer detector.
+
+        Args:
+            detection_model: HuggingFace model ID for table detection
+            structure_model: HuggingFace model ID for table structure recognition
+            max_detection_size: Maximum size for detection model input
+            max_structure_size: Maximum size for structure model input
+            device: Device to run inference on (None for auto-detection)
+        """
+        super().__init__()
+        self.detection_model_id = detection_model
+        self.structure_model_id = structure_model
+        self.max_detection_size = max_detection_size
+        self.max_structure_size = max_structure_size
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+
+        # Models will be lazy-loaded
+        self._detection_model = None
+        self._structure_model = None
+
+        # Transforms for detection and structure recognition
+        self.detection_transform = transforms.Compose([
+            self.MaxResize(max_detection_size),
+            transforms.ToTensor(),
+            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+        ])
+
+        self.structure_transform = transforms.Compose([
+            self.MaxResize(max_structure_size),
+            transforms.ToTensor(),
+            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+        ])
+
+        # Supported classes
+        self.supported_classes = {
+            'table', 'table row', 'table column', 'table column header'
+        }
+
+    @property
+    def detection_model(self):
+        """Lazy-load the table detection model."""
+        if self._detection_model is None:
+            self._detection_model = AutoModelForObjectDetection.from_pretrained(
+                self.detection_model_id, revision="no_timm"
+            ).to(self.device)
+        return self._detection_model
+
+    @property
+    def structure_model(self):
+        """Lazy-load the table structure recognition model."""
+        if self._structure_model is None:
+            self._structure_model = AutoModelForObjectDetection.from_pretrained(
+                self.structure_model_id
+            ).to(self.device)
+        return self._structure_model
+
+    def box_cxcywh_to_xyxy(self, x):
+        """Convert bounding box from center-width format to corner format."""
+        x_c, y_c, w, h = x.unbind(-1)
+        b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
+        return torch.stack(b, dim=1)
+
+    def rescale_bboxes(self, out_bbox, size):
+        """Rescale bounding boxes to image size."""
+        width, height = size
+        boxes = self.box_cxcywh_to_xyxy(out_bbox)
+        boxes = boxes * torch.tensor([width, height, width, height], dtype=torch.float32)
+        return boxes
+
+    def outputs_to_objects(self, outputs, img_size, id2label):
+        """Convert model outputs to structured objects."""
+        m = outputs.logits.softmax(-1).max(-1)
+        pred_labels = list(m.indices.detach().cpu().numpy())[0]
+        pred_scores = list(m.values.detach().cpu().numpy())[0]
+        pred_bboxes = outputs['pred_boxes'].detach().cpu()[0]
+        pred_bboxes = [elem.tolist() for elem in self.rescale_bboxes(pred_bboxes, img_size)]
+
+        objects = []
+        for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes):
+            class_label = id2label[int(label)]
+            if not class_label == 'no object':
+                objects.append({
+                    'label': class_label,
+                    'score': float(score),
+                    'bbox': [float(elem) for elem in bbox]
+                })
+        return objects
+
+    def detect(self, image_path: str, confidence: float = 0.5,
+               classes: Optional[List[str]] = None,
+               exclude_classes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        """
+        Detect tables and their structure in an image.
+
+        Args:
+            image_path: Path to the image to analyze
+            confidence: Minimum confidence threshold for detections
+            classes: Specific classes to detect, or None for all supported classes
+            exclude_classes: Classes to exclude from detection
+
+        Returns:
+            List of detected regions with their properties
+        """
+        # Validate requested classes
+        self.validate_classes(classes or [])
+
+        # Validate excluded classes
+        if exclude_classes:
+            self.validate_classes(exclude_classes)
+
+        # Load the image
+        image = Image.open(image_path).convert("RGB")
+
+        # Detect tables
+        pixel_values = self.detection_transform(image).unsqueeze(0).to(self.device)
+        with torch.no_grad():
+            outputs = self.detection_model(pixel_values)
+
+        id2label = self.detection_model.config.id2label
+        id2label[len(id2label)] = "no object"
+        tables = self.outputs_to_objects(outputs, image.size, id2label)
+
+        # Filter by confidence
+        tables = [t for t in tables if t['score'] >= confidence]
+
+        # If no tables found, return empty list
+        if not tables:
+            return []
+
+        # Process each table to find its structure
+        all_detections = []
+
+        # Add tables to detections if requested
+        if not classes or 'table' in classes:
+            if not exclude_classes or 'table' not in exclude_classes:
+                for table in tables:
+                    all_detections.append({
+                        'bbox': tuple(table['bbox']),
+                        'class': 'table',
+                        'confidence': float(table['score']),
+                        'normalized_class': 'table'
+                    })
+
+        # Process table structure if needed
+        structure_classes = {'table row', 'table column', 'table column header'}
+        needed_structure = False
+
+        # Check if we need to process structure
+        if not classes:
+            # No classes specified, detect all non-excluded
+            needed_structure = any(c not in (exclude_classes or []) for c in structure_classes)
+        else:
+            # Specific classes requested
+            needed_structure = any(c in classes for c in structure_classes)
+
+        if needed_structure:
+            for table in tables:
+                # Crop the table
+                x_min, y_min, x_max, y_max = table['bbox']
+                cropped_table = image.crop((x_min, y_min, x_max, y_max))
+
+                # Recognize table structure
+                structure_pixel_values = self.structure_transform(cropped_table).unsqueeze(0).to(self.device)
+                with torch.no_grad():
+                    structure_outputs = self.structure_model(structure_pixel_values)
+
+                structure_id2label = self.structure_model.config.id2label
+                structure_id2label[len(structure_id2label)] = "no object"
+
+                # Get table structure elements
+                structure_elements = self.outputs_to_objects(structure_outputs, cropped_table.size, structure_id2label)
+
+                # Filter by confidence
+                structure_elements = [e for e in structure_elements if e['score'] >= confidence]
+
+                # Process each structure element
+                for element in structure_elements:
+                    element_class = element['label']
+
+                    # Skip if specific classes requested and this isn't one of them
+                    if classes and element_class not in classes:
+                        continue
+
+                    # Skip if this class is in the excluded classes
+                    if exclude_classes and element_class in exclude_classes:
+                        continue
+
+                    # Adjust coordinates to the original image (add table's top-left corner)
+                    x_min_struct, y_min_struct, x_max_struct, y_max_struct = element['bbox']
+                    adjusted_bbox = (
+                        x_min_struct + x_min,
+                        y_min_struct + y_min,
+                        x_max_struct + x_min,
+                        y_max_struct + y_min
+                    )
+
+                    all_detections.append({
+                        'bbox': adjusted_bbox,
+                        'class': element_class,
+                        'confidence': float(element['score']),
+                        'normalized_class': self._normalize_class_name(element_class)
+                    })
+
+        return all_detections
+
+
+class PaddleLayoutDetector(LayoutDetector):
+    """
+    Document layout and table structure detector using PaddlePaddle's PP-Structure.
+    """
+    def __init__(self,
+                 lang: str = "en",
+                 use_angle_cls: bool = False,
+                 device: str = "cpu",
+                 enable_table: bool = True,
+                 show_log: bool = False,
+                 detect_text: bool = True,
+                 verbose: bool = False):
+        """
+        Initialize the PaddlePaddle layout detector.
+
+        Args:
+            lang: Language code for the detector ('en', 'ch', etc.)
+            use_angle_cls: Whether to use text orientation detection
+            device: Device to run inference on ('cpu' or 'gpu')
+            enable_table: Whether to use PP-Structure table detection
+            show_log: Whether to show PaddleOCR logs
+            detect_text: Whether to use direct text detection in addition to layout
+            verbose: Whether to show detailed detection information
+        """
+        # Set a module-specific logger
+        self.logger = logging.getLogger("natural_pdf.analyzers.layout.paddle")
+        # Store current level to restore it later
+        self.original_level = self.logger.level
+        # Set to DEBUG if verbose is True
+        if verbose:
+            self.logger.setLevel(logging.DEBUG)
+        super().__init__()
+        self.lang = lang
+        self.use_angle_cls = use_angle_cls
+        self.device = device
+        self.enable_table = enable_table
+        self.show_log = show_log
+        self.detect_text = detect_text
+        self.verbose = verbose
+        self._ppstructure = None
+
+    def __del__(self):
+        # Restore the original logging level
+        self.logger.setLevel(self.original_level)
+
+        # Validate PaddlePaddle availability
+        if not self._is_paddle_available():
+            raise ImportError(
+                "PaddlePaddle and PaddleOCR are required for PaddleLayoutDetector. "
+                "Please install them with: pip install paddlepaddle paddleocr"
+            )
+
+        # Supported classes by PP-Structure
+        self.supported_classes = {
+            'text', 'title', 'figure', 'figure_caption',
+            'table', 'table_caption', 'table_cell', 'table_row', 'table_column',
+            'header', 'footer', 'reference', 'equation'
+        }
+
+    def _is_paddle_available(self) -> bool:
+        """Check if PaddlePaddle and PaddleOCR are installed."""
+        paddle_spec = importlib.util.find_spec("paddle")
+        paddleocr_spec = importlib.util.find_spec("paddleocr")
+        return paddle_spec is not None and paddleocr_spec is not None
+
+    @property
+    def ppstructure(self):
+        """Lazy-load the PP-Structure model."""
+        if self._ppstructure is None:
+            # Import here to avoid dependency if not used
+            from paddleocr import PPStructure
+
+            # Initialize PP-Structure with minimal settings
+            # Note: Paddleocr's PPStructure requires minimal parameters to work correctly
+            layout_config = {
+                'show_log': self.show_log,
+                'lang': self.lang
+            }
+
+            # Initialize PP-Structure with enhanced settings
+            self._ppstructure = PPStructure(**layout_config)
+        return self._ppstructure
+
+    def detect(self, image_path: str, confidence: float = 0.5,
+               classes: Optional[List[str]] = None,
+               exclude_classes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        """
+        Detect layout elements in an image using PaddlePaddle.
+
+        Args:
+            image_path: Path to the image to analyze
+            confidence: Minimum confidence threshold for detections
+            classes: Specific classes to detect, or None for all supported classes
+            exclude_classes: Classes to exclude from detection
+
+        Returns:
+            List of detected regions with their properties
+        """
+        self.logger.info(f"Starting PaddleLayout detection on {image_path}")
+        self.logger.debug(f"Parameters: confidence={confidence}, classes={classes}, exclude_classes={exclude_classes}, detect_text={self.detect_text}")
+        # Validate requested classes
+        self.validate_classes(classes or [])
+
+        # Validate excluded classes
+        if exclude_classes:
+            self.validate_classes(exclude_classes)
+
+        # Convert classes to lowercase for matching
+        classes_lower = [c.lower() for c in (classes or [])]
+        exclude_classes_lower = [c.lower() for c in (exclude_classes or [])]
+
+        # Process image with PP-Structure
+        try:
+            # Try to run PPStructure on the image directly
+            result = self.ppstructure(image_path)
+
+            # Debug output for troubleshooting
+            self.logger.debug(f"PaddleLayout detected {len(result)} regions")
+            for i, reg in enumerate(result):
+                self.logger.debug(f" Region {i+1}: type={reg.get('type', 'unknown')}, "
+                                  f"confidence={reg.get('score', 0.0)}, "
+                                  f"bbox={reg.get('bbox', [])}")
+        except Exception as e:
+            self.logger.error(f"Error in PaddleLayout detection: {e}")
+            return []
+
+        # If no results, return empty list
+        if not result:
+            self.logger.warning("PaddleLayout returned empty results")
+            return []
+
+        # Create detections list with the layout regions
+        detections = []
+
+        # Process standard layout results
+        for region in result:
+            try:
+                region_type = region.get('type', '').lower()
+
+                # Skip if specific classes requested and this isn't one of them
+                if classes and region_type not in classes_lower:
+                    continue
+
+                # Skip if this class is in the excluded classes
+                if exclude_classes and region_type in exclude_classes_lower:
+                    continue
+
+                # Get confidence score (default to 0.99 if not provided)
+                confidence_score = region.get('score', 0.99)
+
+                # Skip if confidence is below threshold
+                if confidence_score < confidence:
+                    continue
+
+                # Get bounding box
+                bbox = region.get('bbox', [0, 0, 0, 0])
+                if len(bbox) < 4:
+                    print(f"Invalid bbox format: {bbox}, skipping region")
+                    continue
+
+                x_min, y_min, x_max, y_max = bbox[0], bbox[1], bbox[2], bbox[3]
+
+                # Normalize the class name for our system
+                if region_type == 'figure':
+                    normalized_type = 'figure'
+                elif region_type in ('text', 'header', 'footer', 'reference'):
+                    normalized_type = 'plain-text'
+                elif region_type == 'table':
+                    normalized_type = 'table'
+                elif region_type == 'title':
+                    normalized_type = 'title'
+                elif region_type == 'equation':
+                    normalized_type = 'isolate-formula'
+                else:
+                    normalized_type = region_type.replace(' ', '-')
+
+                # Add detection
+                detections.append({
+                    'bbox': (x_min, y_min, x_max, y_max),
+                    'class': region_type,
+                    'confidence': confidence_score,
+                    'normalized_class': normalized_type,
+                    'source': 'layout',
+                    'model': 'paddle'
+                })
+            except Exception as e:
+                self.logger.error(f"Error processing layout region: {e}, region data: {region}")
+
+        # Always add text box regions from the direct OCR if detect_text is enabled
+        if self.detect_text:
+            try:
+                # Import PaddleOCR
+                from paddleocr import PaddleOCR
+
+                # Use PaddleOCR directly for text detection only (no recognition for speed)
+                ocr = PaddleOCR(lang=self.lang, show_log=self.show_log)
+                ocr_result = ocr.ocr(image_path, det=True, rec=False, cls=False)
+
+                # Now add text box regions if available
+                if ocr_result and len(ocr_result) > 0 and len(ocr_result[0]) > 0:
+                    text_boxes = ocr_result[0]
+                    self.logger.debug(f"Adding {len(text_boxes)} text box regions from OCR detection")
+
+                    for text_box in text_boxes:
+                        try:
+                            # Get box coordinates - these are actually lists of points, not lists of [box, text, confidence]
+                            # when using det=True, rec=False
+                            points = text_box
+
+                            # When using det=True, rec=False, there's no text or confidence
+                            # Just the polygon points, so we use default values
+                            text = ""
+                            text_confidence = 0.95  # High default confidence for detection
+
+                            # Skip if confidence is below threshold
+                            if text_confidence < confidence:
+                                continue
+
+                            # Calculate bounding box
+                            x_coords = [p[0] for p in points]
+                            y_coords = [p[1] for p in points]
+                            x0, y0 = min(x_coords), min(y_coords)
+                            x1, y1 = max(x_coords), max(y_coords)
+
+                            # Add detection with original polygon points
+                            detections.append({
+                                'bbox': (x0, y0, x1, y1),
+                                'class': 'text',
+                                'confidence': text_confidence,
+                                'normalized_class': 'plain-text',
+                                'polygon': points,
+                                'text': text,
+                                'source': 'ocr',
+                                'model': 'paddle'
+                            })
+                        except Exception as e:
+                            self.logger.error(f"Error processing text box: {e}, box data: {text_box}")
+            except Exception as e:
+                self.logger.error(f"Error adding OCR text boxes: {e}")
+                # Continue with standard layout detection only
+
+        # Process table cells if available and not excluded
+        for region in result:
+            region_type = region.get('type', '').lower()
+
+            # Skip if not a table or table handling is disabled
+            if region_type != 'table' or not self.enable_table:
+                continue
+
+            # Get confidence score (default to 0.99 if not provided)
+            confidence_score = region.get('score', 0.99)
+
+            # Get bounding box for coordinate translation
+            bbox = region.get('bbox', [0, 0, 0, 0])
+            x_min, y_min = bbox[0], bbox[1]
+
+            # Process cells if available
+            if 'res' in region and isinstance(region['res'], dict) and 'cells' in region['res']:
+                cells = region['res']['cells']
+
+                # Process cells, rows, and columns if requested
+                process_cells = not classes or 'table_cell' in classes_lower
+                process_cells = process_cells and ('table_cell' not in exclude_classes_lower)
+
+                if process_cells:
+                    for cell in cells:
+                        # Convert cell coordinates to global coordinates
+                        cell_bbox = cell.get('bbox', [0, 0, 0, 0])
+                        cell_x_min = cell_bbox[0] + x_min
+                        cell_y_min = cell_bbox[1] + y_min
+                        cell_x_max = cell_bbox[2] + x_min
+                        cell_y_max = cell_bbox[3] + y_min
+
+                        # Add cell detection
+                        detections.append({
+                            'bbox': (cell_x_min, cell_y_min, cell_x_max, cell_y_max),
+                            'class': 'table_cell',
+                            'confidence': confidence_score * 0.9,  # Slightly lower confidence for cells
+                            'normalized_class': 'table-cell',
+                            'row_idx': cell.get('row_idx', 0),
+                            'col_idx': cell.get('col_idx', 0),
+                            'source': 'layout'
+                        })
+
+        self.logger.info(f"PaddleLayout detection completed with {len(detections)} regions")
+        return detections
+
+
+def convert_to_regions(page: Any, detections: List[Dict[str, Any]],
+                       scale_factor: float = 1.0) -> List[Region]:
+    """
+    Convert layout detections to Region objects.
+
+    Args:
+        page: Page object to create regions for
+        detections: List of detection dictionaries
+        scale_factor: Factor to scale coordinates from image to PDF space
+
+    Returns:
+        List of Region objects with layout metadata
+    """
+    conversion_logger = logging.getLogger("natural_pdf.analyzers.layout.convert")
+    conversion_logger.debug(f"Converting {len(detections)} detections to regions with scale {scale_factor}")
+    regions = []
+
+    for det in detections:
+        # Extract detection info
+        x_min, y_min, x_max, y_max = det['bbox']
+
+        # Scale coordinates from image to PDF space
+        if scale_factor != 1.0:
+            x_min *= scale_factor
+            y_min *= scale_factor
+            x_max *= scale_factor
+            y_max *= scale_factor
+
+        # Create region with metadata
+        region = Region(page, (x_min, y_min, x_max, y_max))
+        region.region_type = det['class']
+        region.confidence = det['confidence']
+        region.normalized_type = det['normalized_class']
+
+        # Add source info - important for filtering
+        region.source = det.get('source', 'detected')
+        region.model = det.get('model', 'unknown')
+
+        # Add additional metadata if available
+        for key, value in det.items():
+            if key not in ('bbox', 'class', 'confidence', 'normalized_class', 'source', 'model'):
+                setattr(region, key, value)
+
+        regions.append(region)
+
+    conversion_logger.debug(f"Created {len(regions)} region objects from {len(detections)} detections")
+    return regions
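
For orientation, a minimal usage sketch of the module shown above. It only uses signatures that appear in this diff; the page image path "page.png", the Page object named page, and the scale_factor value are hypothetical placeholders, and the heavy dependencies (torch, doclayout_yolo, huggingface_hub, transformers) must be installed for the import to succeed.

    from natural_pdf.analyzers.document_layout import YOLODocLayoutDetector, convert_to_regions

    # Weights are lazy-loaded from Hugging Face on first access of the `model` property.
    detector = YOLODocLayoutDetector(device="cpu")

    # Returns a list of dicts with 'bbox', 'class', 'confidence', 'normalized_class'.
    detections = detector.detect("page.png", confidence=0.2, exclude_classes=["abandon"])

    # Convert image-space boxes into Region objects in PDF space; the scale factor
    # depends on the resolution the page image was rendered at (assumption shown here).
    regions = convert_to_regions(page, detections, scale_factor=0.5)
    for region in regions:
        print(region.region_type, region.normalized_type, region.confidence)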