natural_pdf-25.3.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
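For orientation before the file-level diff, a minimal quick-start sketch in Python. This is an assumption-laden illustration based on the examples/ directory listed above (examples/very_basics.py in particular); the PDF class, pages attribute, and extract_text method are inferred from the file names, not taken from the diff itself.

    # Hypothetical quick start; names are assumptions, not confirmed by this diff.
    from natural_pdf import PDF

    pdf = PDF("document.pdf")   # placeholder path
    page = pdf.pages[0]
    print(page.extract_text())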
@@ -0,0 +1,736 @@
+ """
+ Document layout analysis for natural-pdf.
+
+ This module provides functionality for detecting and analyzing the layout
+ of PDF documents using machine learning models.
+ """
+ import os
+ import tempfile
+ import importlib.util
+ import logging
+ from typing import Dict, List, Optional, Tuple, Union, Any, Set
+ import numpy as np
+ import torch
+ from PIL import Image
+
+ from huggingface_hub import hf_hub_download
+ from doclayout_yolo import YOLOv10
+ from torchvision import transforms
+ from transformers import AutoModelForObjectDetection
+
+ from natural_pdf.elements.region import Region
+
+ # Set up module logger
+ logger = logging.getLogger("natural_pdf.analyzers.layout")
+
+
+ class LayoutDetector:
+     """
+     Base class for document layout detection.
+     """
+     def __init__(self):
+         self.supported_classes: Set[str] = set()
+
+     def detect(self, image_path: str, confidence: float = 0.5,
+                classes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         """
+         Detect layout elements in an image.
+
+         Args:
+             image_path: Path to the image to analyze
+             confidence: Minimum confidence threshold for detections
+             classes: Specific classes to detect, or None for all supported classes
+
+         Returns:
+             List of detected regions with their properties
+         """
+         raise NotImplementedError("Subclasses must implement this method")
+
+     def _normalize_class_name(self, name: str) -> str:
+         """Convert class names with spaces to hyphenated format for selectors."""
+         return name.lower().replace(' ', '-')
+
+     def validate_classes(self, classes: List[str]) -> None:
+         """
+         Validate that the requested classes are supported by this detector.
+
+         Args:
+             classes: List of class names to validate
+
+         Raises:
+             ValueError: If any class is not supported
+         """
+         if classes:
+             normalized_supported = {self._normalize_class_name(c) for c in self.supported_classes}
+             unsupported = [c for c in classes if self._normalize_class_name(c) not in normalized_supported]
+             if unsupported:
+                 raise ValueError(f"Classes not supported by this detector: {unsupported}. "
+                                  f"Supported classes: {sorted(self.supported_classes)}")
+
+
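To make the base-class contract concrete, a small illustrative sketch. ToyDetector is hypothetical and not part of the package; it only exercises _normalize_class_name and validate_classes exactly as defined above.

    # Illustrative only: ToyDetector is not part of natural-pdf.
    class ToyDetector(LayoutDetector):
        def __init__(self):
            super().__init__()
            self.supported_classes = {'plain text', 'table'}

    d = ToyDetector()
    d.validate_classes(['plain-text'])  # passes: 'plain text' normalizes to 'plain-text'
    d.validate_classes(['figure'])      # raises ValueError: class not supported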
+ class YOLODocLayoutDetector(LayoutDetector):
+     """
+     Document layout detector using YOLO model.
+     """
+     def __init__(self,
+                  model_repo: str = "juliozhao/DocLayout-YOLO-DocStructBench",
+                  model_file: str = "doclayout_yolo_docstructbench_imgsz1024.pt",
+                  device: str = "cpu"):
+         """
+         Initialize the YOLO document layout detector.
+
+         Args:
+             model_repo: Hugging Face repository ID for the model
+             model_file: Filename of the model in the repository
+             device: Device to use for inference ('cpu' or 'cuda:0', etc.)
+         """
+         super().__init__()
+         self.model_repo = model_repo
+         self.model_file = model_file
+         self.device = device
+         self._model = None
+         self._model_path = None
+
+         # DocLayout YOLO classes
+         self.supported_classes = {
+             'title', 'plain text', 'abandon', 'figure', 'figure_caption',
+             'table', 'table_caption', 'table_footnote', 'isolate_formula',
+             'formula_caption'
+         }
+
+     @property
+     def model(self) -> YOLOv10:
+         """Lazy-load the model when first needed."""
+         if self._model is None:
+             self._model_path = hf_hub_download(repo_id=self.model_repo, filename=self.model_file)
+             self._model = YOLOv10(self._model_path)
+         return self._model
+
+     def detect(self, image_path: str, confidence: float = 0.2,
+                classes: Optional[List[str]] = None,
+                exclude_classes: Optional[List[str]] = None,
+                image_size: int = 1024) -> List[Dict[str, Any]]:
+         """
+         Detect layout elements in an image using YOLO.
+
+         Args:
+             image_path: Path to the image to analyze
+             confidence: Minimum confidence threshold for detections
+             classes: Specific classes to detect, or None for all supported classes
+             exclude_classes: Classes to exclude from detection
+             image_size: Size to resize the image to before detection
+
+         Returns:
+             List of detected regions with their properties
+         """
+         # Validate requested classes
+         self.validate_classes(classes or [])
+
+         # Validate excluded classes
+         if exclude_classes:
+             self.validate_classes(exclude_classes)
+
+         # Run model prediction
+         results = self.model.predict(
+             image_path,
+             imgsz=image_size,
+             conf=confidence,
+             device=self.device
+         )
+
+         # Process results into standardized format
+         detections = []
+         for result in results:
+             boxes = result.boxes.xyxy  # [x_min, y_min, x_max, y_max]
+             labels = result.boxes.cls
+             scores = result.boxes.conf
+             class_names = result.names
+
+             for box, label, score in zip(boxes, labels, scores):
+                 x_min, y_min, x_max, y_max = box.tolist()
+                 label_idx = int(label)
+                 label_name = class_names[label_idx]
+
+                 # Skip if specific classes requested and this isn't one of them
+                 if classes and label_name not in classes:
+                     continue
+
+                 # Skip if this class is in the excluded classes
+                 if exclude_classes and label_name in exclude_classes:
+                     continue
+
+                 detections.append({
+                     'bbox': (x_min, y_min, x_max, y_max),
+                     'class': label_name,
+                     'confidence': float(score),
+                     'normalized_class': self._normalize_class_name(label_name)
+                 })
+
+         return detections
+
+
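A short usage sketch for the YOLO detector above. The image path is a placeholder; the model weights are fetched from the Hugging Face Hub on first use via hf_hub_download.

    # Sketch: run DocLayout-YOLO on a rendered page image ("page.png" is a placeholder).
    detector = YOLODocLayoutDetector()
    detections = detector.detect("page.png", confidence=0.3, exclude_classes=['abandon'])
    for det in detections:
        print(det['normalized_class'], det['confidence'], det['bbox'])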
+ class TableTransformerDetector(LayoutDetector):
+     """
+     Table structure detector using Microsoft's Table Transformer (TATR) models.
+     """
+
+     # Custom resize transform
+     class MaxResize(object):
+         def __init__(self, max_size=800):
+             self.max_size = max_size
+
+         def __call__(self, image):
+             width, height = image.size
+             current_max_size = max(width, height)
+             scale = self.max_size / current_max_size
+             resized_image = image.resize((int(round(scale * width)), int(round(scale * height))))
+             return resized_image
+
+     def __init__(self,
+                  detection_model: str = "microsoft/table-transformer-detection",
+                  structure_model: str = "microsoft/table-transformer-structure-recognition-v1.1-all",
+                  max_detection_size: int = 800,
+                  max_structure_size: int = 1000,
+                  device: str = None):
+         """
+         Initialize the Table Transformer detector.
+
+         Args:
+             detection_model: HuggingFace model ID for table detection
+             structure_model: HuggingFace model ID for table structure recognition
+             max_detection_size: Maximum size for detection model input
+             max_structure_size: Maximum size for structure model input
+             device: Device to run inference on (None for auto-detection)
+         """
+         super().__init__()
+         self.detection_model_id = detection_model
+         self.structure_model_id = structure_model
+         self.max_detection_size = max_detection_size
+         self.max_structure_size = max_structure_size
+         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+
+         # Models will be lazy-loaded
+         self._detection_model = None
+         self._structure_model = None
+
+         # Transforms for detection and structure recognition
+         self.detection_transform = transforms.Compose([
+             self.MaxResize(max_detection_size),
+             transforms.ToTensor(),
+             transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+         ])
+
+         self.structure_transform = transforms.Compose([
+             self.MaxResize(max_structure_size),
+             transforms.ToTensor(),
+             transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+         ])
+
+         # Supported classes
+         self.supported_classes = {
+             'table', 'table row', 'table column', 'table column header'
+         }
+
+     @property
+     def detection_model(self):
+         """Lazy-load the table detection model."""
+         if self._detection_model is None:
+             self._detection_model = AutoModelForObjectDetection.from_pretrained(
+                 self.detection_model_id, revision="no_timm"
+             ).to(self.device)
+         return self._detection_model
+
+     @property
+     def structure_model(self):
+         """Lazy-load the table structure recognition model."""
+         if self._structure_model is None:
+             self._structure_model = AutoModelForObjectDetection.from_pretrained(
+                 self.structure_model_id
+             ).to(self.device)
+         return self._structure_model
+
+     def box_cxcywh_to_xyxy(self, x):
+         """Convert bounding boxes from (cx, cy, w, h) center format to (x_min, y_min, x_max, y_max) corner format."""
+         x_c, y_c, w, h = x.unbind(-1)
+         b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
+         return torch.stack(b, dim=1)
+
+     def rescale_bboxes(self, out_bbox, size):
+         """Rescale normalized bounding boxes to pixel coordinates in the given image size."""
+         width, height = size
+         boxes = self.box_cxcywh_to_xyxy(out_bbox)
+         boxes = boxes * torch.tensor([width, height, width, height], dtype=torch.float32)
+         return boxes
+
+     def outputs_to_objects(self, outputs, img_size, id2label):
+         """Convert model outputs to structured objects."""
+         m = outputs.logits.softmax(-1).max(-1)
+         pred_labels = list(m.indices.detach().cpu().numpy())[0]
+         pred_scores = list(m.values.detach().cpu().numpy())[0]
+         pred_bboxes = outputs['pred_boxes'].detach().cpu()[0]
+         pred_bboxes = [elem.tolist() for elem in self.rescale_bboxes(pred_bboxes, img_size)]
+
+         objects = []
+         for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes):
+             class_label = id2label[int(label)]
+             if not class_label == 'no object':
+                 objects.append({
+                     'label': class_label,
+                     'score': float(score),
+                     'bbox': [float(elem) for elem in bbox]
+                 })
+         return objects
+
+     def detect(self, image_path: str, confidence: float = 0.5,
+                classes: Optional[List[str]] = None,
+                exclude_classes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         """
+         Detect tables and their structure in an image.
+
+         Args:
+             image_path: Path to the image to analyze
+             confidence: Minimum confidence threshold for detections
+             classes: Specific classes to detect, or None for all supported classes
+             exclude_classes: Classes to exclude from detection
+
+         Returns:
+             List of detected regions with their properties
+         """
+         # Validate requested classes
+         self.validate_classes(classes or [])
+
+         # Validate excluded classes
+         if exclude_classes:
+             self.validate_classes(exclude_classes)
+
+         # Load the image
+         image = Image.open(image_path).convert("RGB")
+
+         # Detect tables
+         pixel_values = self.detection_transform(image).unsqueeze(0).to(self.device)
+         with torch.no_grad():
+             outputs = self.detection_model(pixel_values)
+
+         id2label = self.detection_model.config.id2label
+         id2label[len(id2label)] = "no object"
+         tables = self.outputs_to_objects(outputs, image.size, id2label)
+
+         # Filter by confidence
+         tables = [t for t in tables if t['score'] >= confidence]
+
+         # If no tables found, return empty list
+         if not tables:
+             return []
+
+         # Process each table to find its structure
+         all_detections = []
+
+         # Add tables to detections if requested
+         if not classes or 'table' in classes:
+             if not exclude_classes or 'table' not in exclude_classes:
+                 for table in tables:
+                     all_detections.append({
+                         'bbox': tuple(table['bbox']),
+                         'class': 'table',
+                         'confidence': float(table['score']),
+                         'normalized_class': 'table'
+                     })
+
+         # Process table structure if needed
+         structure_classes = {'table row', 'table column', 'table column header'}
+         needed_structure = False
+
+         # Check if we need to process structure
+         if not classes:
+             # No classes specified, detect all non-excluded
+             needed_structure = any(c not in (exclude_classes or []) for c in structure_classes)
+         else:
+             # Specific classes requested
+             needed_structure = any(c in classes for c in structure_classes)
+
+         if needed_structure:
+             for table in tables:
+                 # Crop the table
+                 x_min, y_min, x_max, y_max = table['bbox']
+                 cropped_table = image.crop((x_min, y_min, x_max, y_max))
+
+                 # Recognize table structure
+                 structure_pixel_values = self.structure_transform(cropped_table).unsqueeze(0).to(self.device)
+                 with torch.no_grad():
+                     structure_outputs = self.structure_model(structure_pixel_values)
+
+                 structure_id2label = self.structure_model.config.id2label
+                 structure_id2label[len(structure_id2label)] = "no object"
+
+                 # Get table structure elements
+                 structure_elements = self.outputs_to_objects(structure_outputs, cropped_table.size, structure_id2label)
+
+                 # Filter by confidence
+                 structure_elements = [e for e in structure_elements if e['score'] >= confidence]
+
+                 # Process each structure element
+                 for element in structure_elements:
+                     element_class = element['label']
+
+                     # Skip if specific classes requested and this isn't one of them
+                     if classes and element_class not in classes:
+                         continue
+
+                     # Skip if this class is in the excluded classes
+                     if exclude_classes and element_class in exclude_classes:
+                         continue
+
+                     # Adjust coordinates to the original image (add table's top-left corner)
+                     x_min_struct, y_min_struct, x_max_struct, y_max_struct = element['bbox']
+                     adjusted_bbox = (
+                         x_min_struct + x_min,
+                         y_min_struct + y_min,
+                         x_max_struct + x_min,
+                         y_max_struct + y_min
+                     )
+
+                     all_detections.append({
+                         'bbox': adjusted_bbox,
+                         'class': element_class,
+                         'confidence': float(element['score']),
+                         'normalized_class': self._normalize_class_name(element_class)
+                     })
+
+         return all_detections
+
+
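The Table Transformer detector above works in two passes: table detection on the full image, then structure recognition on each cropped table, with structure boxes translated back into full-image coordinates by adding the table's top-left corner. A usage sketch, with a placeholder image path:

    # Sketch: detect tables plus their rows and columns in one call.
    tatr = TableTransformerDetector()
    elements = tatr.detect("page.png", confidence=0.5,
                           classes=['table', 'table row', 'table column'])
    rows = [e for e in elements if e['class'] == 'table row']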
+ class PaddleLayoutDetector(LayoutDetector):
+     """
+     Document layout and table structure detector using PaddlePaddle's PP-Structure.
+     """
+     def __init__(self,
+                  lang: str = "en",
+                  use_angle_cls: bool = False,
+                  device: str = "cpu",
+                  enable_table: bool = True,
+                  show_log: bool = False,
+                  detect_text: bool = True,
+                  verbose: bool = False):
+         """
+         Initialize the PaddlePaddle layout detector.
+
+         Args:
+             lang: Language code for the detector ('en', 'ch', etc.)
+             use_angle_cls: Whether to use text orientation detection
+             device: Device to run inference on ('cpu' or 'gpu')
+             enable_table: Whether to use PP-Structure table detection
+             show_log: Whether to show PaddleOCR logs
+             detect_text: Whether to use direct text detection in addition to layout
+             verbose: Whether to show detailed detection information
+         """
+         # Set a module-specific logger
+         self.logger = logging.getLogger("natural_pdf.analyzers.layout.paddle")
+         # Store the current level to restore it later
+         self.original_level = self.logger.level
+         # Set to DEBUG if verbose is True
+         if verbose:
+             self.logger.setLevel(logging.DEBUG)
+         super().__init__()
+         self.lang = lang
+         self.use_angle_cls = use_angle_cls
+         self.device = device
+         self.enable_table = enable_table
+         self.show_log = show_log
+         self.detect_text = detect_text
+         self.verbose = verbose
+         self._ppstructure = None
+
+         # Validate PaddlePaddle availability
+         if not self._is_paddle_available():
+             raise ImportError(
+                 "PaddlePaddle and PaddleOCR are required for PaddleLayoutDetector. "
+                 "Please install them with: pip install paddlepaddle paddleocr"
+             )
+
+         # Supported classes by PP-Structure
+         self.supported_classes = {
+             'text', 'title', 'figure', 'figure_caption',
+             'table', 'table_caption', 'table_cell', 'table_row', 'table_column',
+             'header', 'footer', 'reference', 'equation'
+         }
+
+     def __del__(self):
+         # Restore the original logging level
+         self.logger.setLevel(self.original_level)
+
+     def _is_paddle_available(self) -> bool:
+         """Check if PaddlePaddle and PaddleOCR are installed."""
+         paddle_spec = importlib.util.find_spec("paddle")
+         paddleocr_spec = importlib.util.find_spec("paddleocr")
+         return paddle_spec is not None and paddleocr_spec is not None
+
+     @property
+     def ppstructure(self):
+         """Lazy-load the PP-Structure model."""
+         if self._ppstructure is None:
+             # Import here to avoid dependency if not used
+             from paddleocr import PPStructure
+
+             # Initialize PP-Structure with minimal settings
+             # Note: PaddleOCR's PPStructure requires minimal parameters to work correctly
+             layout_config = {
+                 'show_log': self.show_log,
+                 'lang': self.lang
+             }
+
+             # Instantiate PP-Structure
+             self._ppstructure = PPStructure(**layout_config)
+         return self._ppstructure
+
+     def detect(self, image_path: str, confidence: float = 0.5,
+                classes: Optional[List[str]] = None,
+                exclude_classes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         """
+         Detect layout elements in an image using PaddlePaddle.
+
+         Args:
+             image_path: Path to the image to analyze
+             confidence: Minimum confidence threshold for detections
+             classes: Specific classes to detect, or None for all supported classes
+             exclude_classes: Classes to exclude from detection
+
+         Returns:
+             List of detected regions with their properties
+         """
+         self.logger.info(f"Starting PaddleLayout detection on {image_path}")
+         self.logger.debug(f"Parameters: confidence={confidence}, classes={classes}, exclude_classes={exclude_classes}, detect_text={self.detect_text}")
+         # Validate requested classes
+         self.validate_classes(classes or [])
+
+         # Validate excluded classes
+         if exclude_classes:
+             self.validate_classes(exclude_classes)
+
+         # Convert classes to lowercase for matching
+         classes_lower = [c.lower() for c in (classes or [])]
+         exclude_classes_lower = [c.lower() for c in (exclude_classes or [])]
+
+         # Process image with PP-Structure
+         try:
+             # Run PPStructure on the image directly
+             result = self.ppstructure(image_path)
+
+             # Debug output for troubleshooting
+             self.logger.debug(f"PaddleLayout detected {len(result)} regions")
+             for i, reg in enumerate(result):
+                 self.logger.debug(f"  Region {i+1}: type={reg.get('type', 'unknown')}, "
+                                   f"confidence={reg.get('score', 0.0)}, "
+                                   f"bbox={reg.get('bbox', [])}")
+         except Exception as e:
+             self.logger.error(f"Error in PaddleLayout detection: {e}")
+             return []
+
+         # If no results, return empty list
+         if not result:
+             self.logger.warning("PaddleLayout returned empty results")
+             return []
+
+         # Create detections list with the layout regions
+         detections = []
+
+         # Process standard layout results
+         for region in result:
+             try:
+                 region_type = region.get('type', '').lower()
+
+                 # Skip if specific classes requested and this isn't one of them
+                 if classes and region_type not in classes_lower:
+                     continue
+
+                 # Skip if this class is in the excluded classes
+                 if exclude_classes and region_type in exclude_classes_lower:
+                     continue
+
+                 # Get confidence score (default to 0.99 if not provided)
+                 confidence_score = region.get('score', 0.99)
+
+                 # Skip if confidence is below threshold
+                 if confidence_score < confidence:
+                     continue
+
+                 # Get bounding box
+                 bbox = region.get('bbox', [0, 0, 0, 0])
+                 if len(bbox) < 4:
+                     self.logger.warning(f"Invalid bbox format: {bbox}, skipping region")
+                     continue
+
+                 x_min, y_min, x_max, y_max = bbox[0], bbox[1], bbox[2], bbox[3]
+
+                 # Normalize the class name for our system
+                 if region_type == 'figure':
+                     normalized_type = 'figure'
+                 elif region_type in ('text', 'header', 'footer', 'reference'):
+                     normalized_type = 'plain-text'
+                 elif region_type == 'table':
+                     normalized_type = 'table'
+                 elif region_type == 'title':
+                     normalized_type = 'title'
+                 elif region_type == 'equation':
+                     normalized_type = 'isolate-formula'
+                 else:
+                     normalized_type = region_type.replace(' ', '-')
+
+                 # Add detection
+                 detections.append({
+                     'bbox': (x_min, y_min, x_max, y_max),
+                     'class': region_type,
+                     'confidence': confidence_score,
+                     'normalized_class': normalized_type,
+                     'source': 'layout',
+                     'model': 'paddle'
+                 })
+             except Exception as e:
+                 self.logger.error(f"Error processing layout region: {e}, region data: {region}")
+
+         # Always add text box regions from direct OCR if detect_text is enabled
+         if self.detect_text:
+             try:
+                 # Import PaddleOCR
+                 from paddleocr import PaddleOCR
+
+                 # Use PaddleOCR directly for text detection only (no recognition, for speed)
+                 ocr = PaddleOCR(lang=self.lang, show_log=self.show_log)
+                 ocr_result = ocr.ocr(image_path, det=True, rec=False, cls=False)
+
+                 # Now add text box regions if available
+                 if ocr_result and len(ocr_result) > 0 and len(ocr_result[0]) > 0:
+                     text_boxes = ocr_result[0]
+                     self.logger.debug(f"Adding {len(text_boxes)} text box regions from OCR detection")
+
+                     for text_box in text_boxes:
+                         try:
+                             # With det=True, rec=False each result is a list of polygon
+                             # points, not a [box, (text, confidence)] pair
+                             points = text_box
+
+                             # Detection-only mode returns no text or confidence,
+                             # so use default values
+                             text = ""
+                             text_confidence = 0.95  # High default confidence for detection
+
+                             # Skip if confidence is below threshold
+                             if text_confidence < confidence:
+                                 continue
+
+                             # Calculate bounding box
+                             x_coords = [p[0] for p in points]
+                             y_coords = [p[1] for p in points]
+                             x0, y0 = min(x_coords), min(y_coords)
+                             x1, y1 = max(x_coords), max(y_coords)
+
+                             # Add detection with original polygon points
+                             detections.append({
+                                 'bbox': (x0, y0, x1, y1),
+                                 'class': 'text',
+                                 'confidence': text_confidence,
+                                 'normalized_class': 'plain-text',
+                                 'polygon': points,
+                                 'text': text,
+                                 'source': 'ocr',
+                                 'model': 'paddle'
+                             })
+                         except Exception as e:
+                             self.logger.error(f"Error processing text box: {e}, box data: {text_box}")
+             except Exception as e:
+                 self.logger.error(f"Error adding OCR text boxes: {e}")
+                 # Continue with standard layout detection only
+
+         # Process table cells if available and not excluded
+         for region in result:
+             region_type = region.get('type', '').lower()
+
+             # Skip if not a table or table handling is disabled
+             if region_type != 'table' or not self.enable_table:
+                 continue
+
+             # Get confidence score (default to 0.99 if not provided)
+             confidence_score = region.get('score', 0.99)
+
+             # Get bounding box for coordinate translation
+             bbox = region.get('bbox', [0, 0, 0, 0])
+             x_min, y_min = bbox[0], bbox[1]
+
+             # Process cells if available
+             if 'res' in region and isinstance(region['res'], dict) and 'cells' in region['res']:
+                 cells = region['res']['cells']
+
+                 # Process cells if requested and not excluded
+                 process_cells = not classes or 'table_cell' in classes_lower
+                 process_cells = process_cells and ('table_cell' not in exclude_classes_lower)
+
+                 if process_cells:
+                     for cell in cells:
+                         # Convert cell coordinates to global coordinates
+                         cell_bbox = cell.get('bbox', [0, 0, 0, 0])
+                         cell_x_min = cell_bbox[0] + x_min
+                         cell_y_min = cell_bbox[1] + y_min
+                         cell_x_max = cell_bbox[2] + x_min
+                         cell_y_max = cell_bbox[3] + y_min
+
+                         # Add cell detection
+                         detections.append({
+                             'bbox': (cell_x_min, cell_y_min, cell_x_max, cell_y_max),
+                             'class': 'table_cell',
+                             'confidence': confidence_score * 0.9,  # Slightly lower confidence for cells
+                             'normalized_class': 'table-cell',
+                             'row_idx': cell.get('row_idx', 0),
+                             'col_idx': cell.get('col_idx', 0),
+                             'source': 'layout'
+                         })
+
+         self.logger.info(f"PaddleLayout detection completed with {len(detections)} regions")
+         return detections
+
+
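A sketch for the PaddlePaddle path, which requires paddlepaddle and paddleocr to be installed (the ImportError above says as much). Disabling detect_text skips the extra OCR text-box pass:

    # Sketch: layout-only PP-Structure detection ("page.png" is a placeholder).
    paddle = PaddleLayoutDetector(lang="en", detect_text=False)
    detections = paddle.detect("page.png", confidence=0.5, exclude_classes=['footer'])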
+ def convert_to_regions(page: Any, detections: List[Dict[str, Any]],
+                        scale_factor: float = 1.0) -> List[Region]:
+     """
+     Convert layout detections to Region objects.
+
+     Args:
+         page: Page object to create regions for
+         detections: List of detection dictionaries
+         scale_factor: Factor to scale coordinates from image to PDF space
+
+     Returns:
+         List of Region objects with layout metadata
+     """
+     conversion_logger = logging.getLogger("natural_pdf.analyzers.layout.convert")
+     conversion_logger.debug(f"Converting {len(detections)} detections to regions with scale {scale_factor}")
+     regions = []
+
+     for det in detections:
+         # Extract detection info
+         x_min, y_min, x_max, y_max = det['bbox']
+
+         # Scale coordinates from image to PDF space
+         if scale_factor != 1.0:
+             x_min *= scale_factor
+             y_min *= scale_factor
+             x_max *= scale_factor
+             y_max *= scale_factor
+
+         # Create region with metadata
+         region = Region(page, (x_min, y_min, x_max, y_max))
+         region.region_type = det['class']
+         region.confidence = det['confidence']
+         region.normalized_type = det['normalized_class']
+
+         # Add source info - important for filtering
+         region.source = det.get('source', 'detected')
+         region.model = det.get('model', 'unknown')
+
+         # Add additional metadata if available
+         for key, value in det.items():
+             if key not in ('bbox', 'class', 'confidence', 'normalized_class', 'source', 'model'):
+                 setattr(region, key, value)
+
+         regions.append(region)
+
+     conversion_logger.debug(f"Created {len(regions)} region objects from {len(detections)} detections")
+     return regions
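Putting the pieces together: detections from any of the detectors can be mapped onto a page through convert_to_regions. The scale factor converts rendered-image pixels back to PDF points, so a page rendered at 144 DPI needs scale_factor = 72 / 144 = 0.5. A sketch, where the PDF class and pages attribute are again assumptions about the package API:

    # Sketch: turn raw detections into Region objects on a natural-pdf page.
    pdf = PDF("document.pdf")                                # assumed API
    page = pdf.pages[0]
    detections = YOLODocLayoutDetector().detect("page.png")  # image rendered at 144 DPI
    regions = convert_to_regions(page, detections, scale_factor=72 / 144)
    tables = [r for r in regions if r.region_type == 'table']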