natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,200 @@
1
+ # layout_manager.py
2
+ import logging
3
+ from typing import Dict, List, Any, Optional, Union, Type
4
+ from PIL import Image
5
+ import copy
6
+
7
+ # --- Import detector classes and options ---
8
+ # Use try-except blocks for robustness if some detectors might be missing dependencies
9
+ try:
10
+ from .base import LayoutDetector
11
+ except ImportError:
12
+ LayoutDetector = type('LayoutDetector', (), {})
13
+
14
+ try:
15
+ from .yolo import YOLODocLayoutDetector
16
+ except ImportError:
17
+ YOLODocLayoutDetector = None
18
+
19
+ try:
20
+ from .tatr import TableTransformerDetector
21
+ except ImportError:
22
+ TableTransformerDetector = None
23
+
24
+ try:
25
+ from .paddle import PaddleLayoutDetector
26
+ except ImportError:
27
+ PaddleLayoutDetector = None
28
+
29
+ try:
30
+ from .surya import SuryaLayoutDetector
31
+ except ImportError:
32
+ SuryaLayoutDetector = None
33
+
34
+ try:
35
+ from .docling import DoclingLayoutDetector
36
+ except ImportError:
37
+ DoclingLayoutDetector = None
38
+
39
+ from .layout_options import (
40
+ BaseLayoutOptions, YOLOLayoutOptions, TATRLayoutOptions,
41
+ PaddleLayoutOptions, SuryaLayoutOptions, DoclingLayoutOptions, LayoutOptions
42
+ )
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
class LayoutManager:
    """Manages layout detector selection, configuration, and execution.

    Engines are registered in ``ENGINE_REGISTRY`` when the class body is
    executed; an engine is listed only if its detector class imported
    successfully. Detector instances are created on first use and cached
    per manager in ``_detector_instances``.
    """

    # Registry mapping engine names to classes and default options
    ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {}

    # Populate registry only with detectors whose import succeeded
    if YOLODocLayoutDetector:
        ENGINE_REGISTRY['yolo'] = {'class': YOLODocLayoutDetector, 'options_class': YOLOLayoutOptions}
    if TableTransformerDetector:
        ENGINE_REGISTRY['tatr'] = {'class': TableTransformerDetector, 'options_class': TATRLayoutOptions}
    if PaddleLayoutDetector:
        ENGINE_REGISTRY['paddle'] = {'class': PaddleLayoutDetector, 'options_class': PaddleLayoutOptions}
    if SuryaLayoutDetector:
        ENGINE_REGISTRY['surya'] = {'class': SuryaLayoutDetector, 'options_class': SuryaLayoutOptions}
    if DoclingLayoutDetector:
        ENGINE_REGISTRY['docling'] = {'class': DoclingLayoutDetector, 'options_class': DoclingLayoutOptions}

    # Define the limited set of kwargs allowed for the simple analyze_layout call
    SIMPLE_MODE_ALLOWED_KWARGS = {
        'engine', 'confidence', 'classes', 'exclude_classes', 'device'
    }

    def __init__(self):
        """Initializes the Layout Manager with an empty detector-instance cache."""
        # Cache for detector instances (different from model cache inside detector)
        self._detector_instances: Dict[str, LayoutDetector] = {}
        logger.info(f"LayoutManager initialized. Available engines: {list(self.ENGINE_REGISTRY.keys())}")

    def _get_engine_instance(self, engine_name: str) -> LayoutDetector:
        """Retrieves or creates an instance of the specified layout detector.

        Args:
            engine_name: Registry key of the engine (matched case-insensitively).

        Returns:
            The cached detector instance for that engine.

        Raises:
            ValueError: If the engine name is not in ``ENGINE_REGISTRY``.
            RuntimeError: If the detector reports it is not available.
        """
        engine_name = engine_name.lower()
        if engine_name not in self.ENGINE_REGISTRY:
            raise ValueError(f"Unknown layout engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}")

        if engine_name not in self._detector_instances:
            logger.info(f"Creating instance of layout engine: {engine_name}")
            engine_class = self.ENGINE_REGISTRY[engine_name]['class']
            detector_instance = engine_class()
            # Check availability before storing so a broken engine is never cached
            if not detector_instance.is_available():
                raise RuntimeError(f"Layout engine '{engine_name}' is not available. Please check dependencies.")
            self._detector_instances[engine_name] = detector_instance

        return self._detector_instances[engine_name]

    def analyze_layout(
        self,
        image: Image.Image,
        engine: Optional[str] = None,  # Default engine handled below
        options: Optional[LayoutOptions] = None,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """
        Analyzes layout of a single image using simple args or an options object.

        Args:
            image: The PIL Image to analyze.
            engine: Name of the engine (e.g., 'yolo', 'tatr'). Ignored if 'options' provided.
                    Defaults to the first available engine if None.
            options: Specific LayoutOptions object for advanced configuration.
                     The engine is chosen from the type of this object.
            **kwargs: For simple mode, accepts: 'confidence', 'classes',
                      'exclude_classes', 'device'. Ignored (with a warning)
                      when 'options' is provided.

        Returns:
            A list of standardized detection dictionaries.

        Raises:
            TypeError: If 'image' is not a PIL Image, if 'options' matches no
                registered engine, or if simple mode receives unknown kwargs.
            RuntimeError: If no layout engine is available.
            ValueError: If the requested engine is not registered.
        """
        final_options: BaseLayoutOptions
        selected_engine_name: str

        if not isinstance(image, Image.Image):
            raise TypeError("Input 'image' must be a PIL Image.")

        available_engines = self.get_available_engines()
        if not available_engines:
            raise RuntimeError("No layout engines are available. Please check dependencies.")

        # --- Determine Options and Engine ---
        if options is not None:
            # Advanced Mode: the options object decides the engine
            logger.debug(f"LayoutManager: Using advanced mode with options object: {type(options).__name__}")
            final_options = copy.deepcopy(options)  # Never hand the caller's object to the engine
            for name, registry_entry in self.ENGINE_REGISTRY.items():
                if isinstance(options, registry_entry['options_class']):
                    selected_engine_name = name
                    break
            else:
                raise TypeError(f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options.")
            if kwargs:
                logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored.")
        else:
            # Simple Mode: explicit engine name, else the first available engine
            selected_engine_name = (engine if engine else available_engines[0]).lower()
            logger.debug(f"LayoutManager: Using simple mode with engine: '{selected_engine_name}' and kwargs: {kwargs}")

            if selected_engine_name not in self.ENGINE_REGISTRY:
                raise ValueError(f"Unknown or unavailable layout engine: '{selected_engine_name}'. Available: {available_engines}")

            # Sorted so the error text is deterministic (set order is arbitrary)
            unexpected_kwargs = sorted(set(kwargs.keys()) - self.SIMPLE_MODE_ALLOWED_KWARGS)
            if unexpected_kwargs:
                raise TypeError(f"Got unexpected keyword arguments in simple mode: {unexpected_kwargs}. Use the 'options' parameter for detailed configuration.")

            options_class = self.ENGINE_REGISTRY[selected_engine_name]['options_class']
            # Use BaseLayoutOptions defaults unless overridden by kwargs
            base_defaults = BaseLayoutOptions()
            simple_args = {
                'confidence': kwargs.get('confidence', base_defaults.confidence),
                'classes': kwargs.get('classes'),
                'exclude_classes': kwargs.get('exclude_classes'),
                'device': kwargs.get('device', base_defaults.device)
            }
            # Drop None values so the options class applies its own defaults
            simple_args_filtered = {k: v for k, v in simple_args.items() if v is not None}
            final_options = options_class(**simple_args_filtered)
            logger.debug(f"LayoutManager: Constructed options for simple mode: {final_options}")

        # --- Get Engine Instance and Process ---
        try:
            engine_instance = self._get_engine_instance(selected_engine_name)
            logger.info(f"Analyzing layout with engine '{selected_engine_name}'...")

            # Call the engine's detect method
            detections = engine_instance.detect(image, final_options)

            logger.info(f"Layout analysis complete. Found {len(detections)} regions.")
            return detections

        except (ImportError, RuntimeError, ValueError, TypeError) as e:
            logger.error(f"Layout analysis failed for engine '{selected_engine_name}': {e}", exc_info=True)
            raise  # Re-raise expected errors
        except Exception as e:
            logger.error(f"An unexpected error occurred during layout analysis: {e}", exc_info=True)
            raise  # Re-raise unexpected errors

    def get_available_engines(self) -> List[str]:
        """Returns a list of registered layout engine names that are currently available.

        An engine counts as available when its detector's ``is_available()``
        returns a truthy value. A detector already held in the instance cache
        is reused for the check; otherwise a temporary instance is created.
        Classes without an ``is_available`` callable are assumed available.
        Any exception raised during a check is logged at debug level and the
        engine is left out of the result.
        """
        available = []
        for name, registry_entry in self.ENGINE_REGISTRY.items():
            try:
                engine_class = registry_entry['class']
                if hasattr(engine_class, 'is_available') and callable(engine_class.is_available):
                    # Reuse the cached detector when there is one; this method
                    # runs on every analyze_layout call, so avoid re-instantiating.
                    detector = self._detector_instances.get(name)
                    if detector is None:
                        detector = engine_class()
                    if detector.is_available():
                        available.append(name)
                else:
                    # Assume available if class exists (less robust)
                    available.append(name)
            except Exception as e:
                logger.debug(f"Layout engine '{name}' check failed: {e}")
        return available
200
+
@@ -0,0 +1,78 @@
1
+ # layout_options.py
2
+ import logging
3
+ from dataclasses import dataclass, field
4
+ from typing import List, Optional, Dict, Any, Tuple, Union
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ # --- Base Layout Options ---
9
@dataclass
class BaseLayoutOptions:
    """Base options for layout detection engines.

    Attributes:
        confidence: Minimum confidence threshold for detections.
        classes: Specific classes to detect (None for all).
        exclude_classes: Classes to exclude.
        device: Preferred device ('cpu', 'cuda', 'mps', etc.).
        extra_args: Engine-specific arguments that are not yet dedicated
            fields; each instance gets its own empty dict by default.
    """

    confidence: float = 0.5
    classes: Optional[List[str]] = None
    exclude_classes: Optional[List[str]] = None
    device: Optional[str] = 'cpu'
    extra_args: Dict[str, Any] = field(default_factory=dict)
17
+
18
+ # --- YOLO Specific Options ---
19
@dataclass
class YOLOLayoutOptions(BaseLayoutOptions):
    """Options specific to YOLO-based layout detection.

    Attributes:
        model_repo: Identifier of the repository holding the model.
        model_file: File name of the model weights.
        image_size: Input image size for the model.
    """

    model_repo: str = "juliozhao/DocLayout-YOLO-DocStructBench"
    model_file: str = "doclayout_yolo_docstructbench_imgsz1024.pt"
    image_size: int = 1024
25
+
26
+ # --- TATR Specific Options ---
27
@dataclass
class TATRLayoutOptions(BaseLayoutOptions):
    """Options specific to Table Transformer (TATR) layout detection.

    Attributes:
        detection_model: Model used for detection (local path or HF identifier).
        structure_model: Model used for structure recognition (local path
            or HF identifier).
        max_detection_size: Input image resizing parameter for detection.
        max_structure_size: Input image resizing parameter for structure
            recognition.
        create_cells: Whether to create cell regions (can be slow).
    """

    detection_model: str = "microsoft/table-transformer-detection"
    structure_model: str = "microsoft/table-transformer-structure-recognition-v1.1-all"
    max_detection_size: int = 800
    max_structure_size: int = 1000
    create_cells: bool = False
38
+
39
+ # --- Paddle Specific Options ---
40
@dataclass
class PaddleLayoutOptions(BaseLayoutOptions):
    """Options specific to PaddlePaddle PP-Structure layout detection.

    Attributes:
        lang: Language ('en', 'ch', etc.).
        use_angle_cls: Use text angle classification.
        enable_table: Enable table structure detection.
        show_log: Show Paddle internal logs.
        detect_text: Also detect raw text boxes using PaddleOCR.
        verbose: Verbose logging for the detector class.
    """

    lang: str = "en"
    use_angle_cls: bool = False
    enable_table: bool = True
    show_log: bool = False
    detect_text: bool = True
    verbose: bool = False
49
+
50
+ # --- Surya Specific Options ---
51
@dataclass
class SuryaLayoutOptions(BaseLayoutOptions):
    """Options specific to Surya layout detection.

    Few engine-specific settings exist; the device comes from
    BaseLayoutOptions.

    Attributes:
        model_name: Placeholder in case different models become available.
        verbose: Verbose logging for the detector class.
    """

    model_name: str = "default"
    verbose: bool = False
58
+
59
+ # --- Docling Specific Options ---
60
@dataclass
class DoclingLayoutOptions(BaseLayoutOptions):
    """Options specific to Docling layout detection.

    Settings for Docling's DocumentConverter are supplied through
    ``extra_args`` rather than dedicated fields. Examples of keys that can
    be placed there: ``model_name`` (e.g. "ds4sd/SmolDocling-256M-preview"),
    ``prompt_text``, ``device``, ``batch_size``.

    Attributes:
        verbose: Verbose logging for the detector class.
    """

    verbose: bool = False
69
+
70
+ # --- Union Type ---
71
# Type alias accepted wherever a layout options object is expected: any of the
# engine-specific option classes, plus the base class for typing flexibility.
LayoutOptions = Union[
    YOLOLayoutOptions, TATRLayoutOptions, PaddleLayoutOptions,
    SuryaLayoutOptions, DoclingLayoutOptions,
    BaseLayoutOptions,
]
@@ -0,0 +1,240 @@
1
+ # layout_detector_paddle.py
2
+ import logging
3
+ import importlib.util
4
+ import os
5
+ import tempfile
6
+ from typing import List, Dict, Any, Optional
7
+ from PIL import Image
8
+
9
+ # Assuming base class and options are importable
10
+ try:
11
+ from .base import LayoutDetector
12
+ from .layout_options import PaddleLayoutOptions, BaseLayoutOptions
13
+ except ImportError:
14
+ # Placeholders if run standalone or imports fail
15
+ class BaseLayoutOptions: pass
16
+ class PaddleLayoutOptions(BaseLayoutOptions): pass
17
+ class LayoutDetector:
18
+ def __init__(self): self.logger=logging.getLogger(); self.supported_classes=set()
19
+ def _get_model(self, options): raise NotImplementedError
20
+ def _normalize_class_name(self, n): return n
21
+ def validate_classes(self, c): pass
22
+ logging.basicConfig()
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # Check for dependencies
27
# Both names stay None unless the optional Paddle packages import cleanly;
# PaddleLayoutDetector.is_available() checks them.
PPStructure = None
PaddleOCR = None  # For optional text detection

# Probe for the packages without importing them
paddle_spec = importlib.util.find_spec("paddle") or importlib.util.find_spec("paddlepaddle")
paddleocr_spec = importlib.util.find_spec("paddleocr")

if not (paddle_spec and paddleocr_spec):
    logger.warning("paddlepaddle or paddleocr not found. PaddleLayoutDetector will not be available.")
else:
    try:
        from paddleocr import PPStructure, PaddleOCR
    except ImportError as e:
        logger.warning(f"Could not import Paddle dependencies: {e}")
39
+
40
+
41
class PaddleLayoutDetector(LayoutDetector):
    """Document layout and table structure detector using PaddlePaddle's PP-Structure."""

    def __init__(self):
        """Set the class names this detector accepts in class filters."""
        super().__init__()
        # Supported classes by PP-Structure (adjust based on model version/capabilities)
        self.supported_classes = {
            'text', 'title', 'figure', 'figure_caption',
            'table', 'table_caption', 'table_cell',
            'header', 'footer', 'reference', 'equation',
            # PP-StructureV2 might add others like list, pub_number etc.
        }
        # Models are loaded via _get_model

    def is_available(self) -> bool:
        """Check if dependencies are installed."""
        return PPStructure is not None and PaddleOCR is not None

    def _get_cache_key(self, options: BaseLayoutOptions) -> str:
        """Generate the cache key for a PPStructure instance.

        The key covers every option that `_load_model_from_options` bakes
        into the PPStructure constructor and that changes its output:
        device, language, table analysis and angle classification.
        `show_log` is left out because it only affects logging.

        Args:
            options: Layout options; non-Paddle options contribute only
                their device, with Paddle defaults for the rest.

        Returns:
            A string key identifying the model configuration.
        """
        if not isinstance(options, PaddleLayoutOptions):
            options = PaddleLayoutOptions(device=options.device)  # Use base device

        device_key = str(options.device).lower() if options.device else 'default_device'
        lang_key = options.lang
        # enable_table / use_angle_cls are constructor arguments of PPStructure,
        # so instances built with different values must not share a cache slot.
        table_key = f"table{int(bool(options.enable_table))}"
        angle_key = f"angle{int(bool(options.use_angle_cls))}"
        return f"{self.__class__.__name__}_{device_key}_{lang_key}_{table_key}_{angle_key}"

    def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
        """Load the PPStructure model based on options.

        Args:
            options: Must be a PaddleLayoutOptions instance.

        Returns:
            A configured PPStructure instance.

        Raises:
            RuntimeError: If the Paddle dependencies are not installed.
            TypeError: If `options` is not a PaddleLayoutOptions.
            Exception: Any error raised by the PPStructure constructor is
                logged and re-raised unchanged.
        """
        if not self.is_available():
            raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")

        if not isinstance(options, PaddleLayoutOptions):
            raise TypeError("Incorrect options type provided for Paddle model loading.")

        self.logger.info(f"Loading PPStructure model (lang={options.lang}, device={options.device}, table={options.enable_table})...")
        try:
            # PPStructure is called with just the image, so runtime behavior
            # has to be fixed here, at construction time.
            device_name = str(options.device).lower()
            model_instance = PPStructure(
                lang=options.lang,
                use_gpu=('cuda' in device_name or 'gpu' in device_name),
                use_angle_cls=options.use_angle_cls,
                show_log=options.show_log,
                layout=True,  # Ensure layout analysis is on
                table=options.enable_table,  # Control table analysis
                ocr=False  # Usually disable internal OCR if only using for layout/table
                # Add other PPStructure init args from options.extra_args if needed
            )
            self.logger.info("PPStructure model loaded.")
            return model_instance
        except Exception as e:
            self.logger.error(f"Failed to load PPStructure model: {e}", exc_info=True)
            raise

    def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
        """Detect layout elements in an image using PaddlePaddle.

        Args:
            image: The PIL image to analyze; it is converted to RGB and
                written to a temporary PNG that is passed to PPStructure.
            options: Detection options. Non-Paddle options are converted to
                PaddleLayoutOptions, keeping the base fields and using
                defaults for the Paddle-specific ones.

        Returns:
            A list of detection dicts with keys 'bbox' (x_min, y_min, x_max,
            y_max as floats), 'class', 'confidence', 'normalized_class',
            'source' and 'model'. Table-cell entries also carry 'text'.
            Regions that cannot be parsed are skipped with a warning.

        Raises:
            RuntimeError: If the Paddle dependencies are not installed.
            Exception: Errors from saving the image or running PPStructure
                are logged and re-raised.
        """
        if not self.is_available():
            raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")

        if not isinstance(options, PaddleLayoutOptions):
            self.logger.warning("Received BaseLayoutOptions, expected PaddleLayoutOptions. Using defaults.")
            options = PaddleLayoutOptions(
                confidence=options.confidence, classes=options.classes,
                exclude_classes=options.exclude_classes, device=options.device,
                extra_args=options.extra_args
                # Other Paddle options will use defaults
            )

        self.validate_classes(options.classes or [])
        if options.exclude_classes:
            self.validate_classes(options.exclude_classes)

        # Get the cached/loaded PPStructure instance
        ppstructure_instance = self._get_model(options)

        detections = []
        # The temporary directory (and the image inside it) is removed when the
        # with-block exits, on success and on error alike.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_image_path = os.path.join(temp_dir, f"paddle_input_{os.getpid()}.png")
            try:
                self.logger.debug(f"Saving temporary image for Paddle detector to: {temp_image_path}")
                image.convert("RGB").save(temp_image_path)  # Ensure RGB

                # The instance was configured during _load_model_from_options
                self.logger.debug("Running PPStructure analysis...")
                result = ppstructure_instance(temp_image_path)
            except Exception as e:
                self.logger.error(f"Error during PPStructure analysis: {e}", exc_info=True)
                raise

        # --- Process Results ---
        if not result:
            self.logger.warning("PaddleLayout returned empty results")
            return []
        self.logger.debug(f"PPStructure returned {len(result)} regions.")

        # Prepare normalized class filters once
        normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
        normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()

        for region in result:
            try:
                region_type_orig = region.get('type', 'unknown')
                # Handle potential list returns for type (seen in some versions)
                if isinstance(region_type_orig, list):
                    region_type_orig = region_type_orig[0] if region_type_orig else 'unknown'

                region_type = region_type_orig.lower()
                normalized_class = self._normalize_class_name(region_type)

                # Apply class filtering
                if normalized_classes_req and normalized_class not in normalized_classes_req:
                    continue
                if normalized_class in normalized_classes_excl:
                    continue

                # PP-Structure results don't always have a score; treat a
                # missing one as full confidence.
                confidence_score = region.get('score', 1.0)
                if confidence_score < options.confidence:
                    continue

                bbox = region.get('bbox')
                # Explicit None/length test: truth-testing an array-like bbox
                # would raise instead of validating it.
                if bbox is None or len(bbox) != 4:
                    self.logger.warning(f"Skipping region with invalid bbox: {region}")
                    continue
                x_min, y_min, x_max, y_max = map(float, bbox)

                detections.append({
                    'bbox': (x_min, y_min, x_max, y_max),
                    'class': region_type_orig,  # Keep original case if needed
                    'confidence': confidence_score,
                    'normalized_class': normalized_class,
                    'source': 'layout',
                    'model': 'paddle'
                })

                # --- Process Table Cells (if enabled and present) ---
                if region_type == 'table' and options.enable_table and 'res' in region:
                    process_cells = (
                        (normalized_classes_req is None or 'table-cell' in normalized_classes_req)
                        and 'table-cell' not in normalized_classes_excl
                    )

                    # Only the list-shaped 'res' (V2 structure) is handled;
                    # the older dict-with-'cells' shape is not processed.
                    if process_cells and isinstance(region['res'], list):
                        for cell in region['res']:
                            if 'box' not in cell or len(cell['box']) != 4:
                                continue
                            cell_x_min, cell_y_min, cell_x_max, cell_y_max = map(float, cell['box'])
                            # Confidence is often not available per cell, so
                            # inherit the table's score, slightly reduced.
                            detections.append({
                                'bbox': (cell_x_min, cell_y_min, cell_x_max, cell_y_max),
                                'class': 'table cell',  # Standardize name
                                'confidence': confidence_score * 0.95,
                                'normalized_class': 'table-cell',
                                'text': cell.get('text', ''),  # Include text if available
                                'source': 'layout', 'model': 'paddle'
                            })

            except (AttributeError, TypeError, KeyError, IndexError, ValueError) as e:
                # AttributeError covers malformed entries (non-dict region,
                # non-string type) so one bad region cannot abort the page.
                self.logger.warning(f"Error processing Paddle region: {region}. Error: {e}")
                continue

        # --- Optional: Add Text Boxes from separate OCR run ---
        if options.detect_text:
            # Not implemented: it would need a separate (cached) PaddleOCR
            # instance run with det=True, rec=False and its boxes added as
            # 'text' detections. Say so instead of claiming OCR is running.
            self.logger.info("Paddle detect_text=True: separate OCR text detection is not implemented; returning layout regions only.")

        self.logger.info(f"PaddleLayout detected {len(detections)} layout elements matching criteria.")
        return detections
240
+
@@ -0,0 +1,151 @@
1
+ # layout_detector_surya.py
2
+ import logging
3
+ import importlib.util
4
+ import os
5
+ import tempfile
6
+ from typing import List, Dict, Any, Optional, Tuple
7
+ from PIL import Image
8
+
9
+ from .base import LayoutDetector
10
+ from .layout_options import SuryaLayoutOptions, BaseLayoutOptions
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Check for dependency
15
# LayoutPredictor stays None unless surya imports cleanly;
# SuryaLayoutDetector.is_available() checks it.
LayoutPredictor = None

# Probe for the package without importing it
surya_spec = importlib.util.find_spec("surya")

if not surya_spec:
    logger.warning("surya not found. SuryaLayoutDetector will not be available.")
else:
    try:
        from surya.layout import LayoutPredictor
    except ImportError as e:
        logger.warning(f"Could not import Surya dependencies: {e}")
24
+
25
+
26
class SuryaLayoutDetector(LayoutDetector):
    """Document layout detector using Surya models."""

    def __init__(self):
        """Set the class names this detector accepts in class filters."""
        super().__init__()
        # Predictor instance is cached via _get_model
        self.supported_classes = {
            'text', 'pageheader', 'pagefooter', 'sectionheader',
            'table', 'tableofcontents', 'picture', 'caption',
            'heading', 'title', 'list', 'listitem', 'code',
            'textinlinemath', 'mathformula', 'form'
        }

    def is_available(self) -> bool:
        """Report whether the Surya LayoutPredictor could be imported."""
        return LayoutPredictor is not None

    def _get_cache_key(self, options: BaseLayoutOptions) -> str:
        """Build the cache key from the detector class, device and model name.

        Non-Surya options contribute only their device; the model name then
        comes from the SuryaLayoutOptions default.
        """
        surya_opts = options
        if not isinstance(surya_opts, SuryaLayoutOptions):
            surya_opts = SuryaLayoutOptions(device=options.device)

        if surya_opts.device:
            device_part = str(surya_opts.device).lower()
        else:
            device_part = 'default_device'
        model_part = surya_opts.model_name
        return f"{self.__class__.__name__}_{device_part}_{model_part}"

    def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
        """Construct a Surya LayoutPredictor from the given options.

        The predictor receives `device` (when set) plus everything in
        `options.extra_args` as keyword arguments.

        Raises:
            RuntimeError: If surya is not installed.
            TypeError: If `options` is not a SuryaLayoutOptions.
            Exception: Constructor errors are logged and re-raised.
        """
        if not self.is_available():
            raise RuntimeError("Surya dependency (surya-ocr) not installed.")

        if not isinstance(options, SuryaLayoutOptions):
            raise TypeError("Incorrect options type provided for Surya model loading.")

        self.logger.info(f"Loading Surya LayoutPredictor (device={options.device})...")
        try:
            init_kwargs = {}
            if options.device:
                init_kwargs['device'] = options.device
            # extra_args may add to or override the device entry
            init_kwargs.update(options.extra_args)

            loaded = LayoutPredictor(**init_kwargs)
            self.logger.info("Surya LayoutPredictor loaded.")
            return loaded
        except Exception as e:
            self.logger.error(f"Failed to load Surya LayoutPredictor: {e}", exc_info=True)
            raise

    def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
        """Run Surya layout prediction on one image and return filtered detections.

        Args:
            image: The PIL image to analyze (converted to RGB first).
            options: Detection options. Non-Surya options are converted to
                SuryaLayoutOptions, keeping the base fields.

        Returns:
            A list of dicts with keys 'bbox', 'class', 'confidence',
            'normalized_class', 'source' and 'model'; empty when the
            predictor returns nothing.

        Raises:
            RuntimeError: If surya is not installed.
            Exception: Prediction errors are logged and re-raised.
        """
        if not self.is_available():
            raise RuntimeError("Surya dependency (surya-ocr) not installed.")

        if not isinstance(options, SuryaLayoutOptions):
            self.logger.warning("Received BaseLayoutOptions, expected SuryaLayoutOptions. Using defaults.")
            options = SuryaLayoutOptions(
                confidence=options.confidence,
                classes=options.classes,
                exclude_classes=options.exclude_classes,
                device=options.device,
                extra_args=options.extra_args,
            )

        self.validate_classes(options.classes or [])
        if options.exclude_classes:
            self.validate_classes(options.exclude_classes)

        # Cached or freshly loaded predictor
        predictor = self._get_model(options)

        # The predictor is called with a list of images; we send exactly one
        rgb_batch = [image.convert("RGB")]

        found = []
        try:
            self.logger.debug("Running Surya layout prediction...")
            results = predictor(rgb_batch)
            self.logger.debug(f"Surya prediction returned {len(results)} results.")

            if not results:
                self.logger.warning("Surya returned empty predictions list.")
                return []

            # Only one image was submitted, so only the first result matters
            page_result = results[0]

            # Normalize the class filters once, outside the loop
            wanted = None
            if options.classes:
                wanted = {self._normalize_class_name(c) for c in options.classes}
            unwanted = set()
            if options.exclude_classes:
                unwanted = {self._normalize_class_name(c) for c in options.exclude_classes}

            for box in page_result.bboxes:
                label = box.label
                norm_label = self._normalize_class_name(label)
                score = float(box.confidence)

                # Confidence threshold first, then include / exclude filters
                if (
                    score < options.confidence
                    or (wanted and norm_label not in wanted)
                    or norm_label in unwanted
                ):
                    continue

                # box.bbox is unpacked as (x_min, y_min, x_max, y_max)
                left, top, right, bottom = (float(v) for v in box.bbox)

                found.append({
                    'bbox': (left, top, right, bottom),
                    'class': label,
                    'confidence': score,
                    'normalized_class': norm_label,
                    'source': 'layout',
                    'model': 'surya'
                })

            self.logger.info(f"Surya detected {len(found)} layout elements matching criteria.")

        except Exception as e:
            self.logger.error(f"Error during Surya layout detection: {e}", exc_info=True)
            raise

        return found
151
+