PyPI - natural-pdf - Versions diffs - 0.1.0__py3-none-any.whl - Mend

natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

natural_pdf/__init__.py +55 -0
natural_pdf/analyzers/__init__.py +6 -0
natural_pdf/analyzers/layout/__init__.py +1 -0
natural_pdf/analyzers/layout/base.py +151 -0
natural_pdf/analyzers/layout/docling.py +247 -0
natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
natural_pdf/analyzers/layout/layout_manager.py +200 -0
natural_pdf/analyzers/layout/layout_options.py +78 -0
natural_pdf/analyzers/layout/paddle.py +240 -0
natural_pdf/analyzers/layout/surya.py +151 -0
natural_pdf/analyzers/layout/tatr.py +251 -0
natural_pdf/analyzers/layout/yolo.py +165 -0
natural_pdf/analyzers/text_options.py +60 -0
natural_pdf/analyzers/text_structure.py +270 -0
natural_pdf/analyzers/utils.py +57 -0
natural_pdf/core/__init__.py +3 -0
natural_pdf/core/element_manager.py +457 -0
natural_pdf/core/highlighting_service.py +698 -0
natural_pdf/core/page.py +1444 -0
natural_pdf/core/pdf.py +653 -0
natural_pdf/elements/__init__.py +3 -0
natural_pdf/elements/base.py +761 -0
natural_pdf/elements/collections.py +1345 -0
natural_pdf/elements/line.py +140 -0
natural_pdf/elements/rect.py +122 -0
natural_pdf/elements/region.py +1793 -0
natural_pdf/elements/text.py +304 -0
natural_pdf/ocr/__init__.py +56 -0
natural_pdf/ocr/engine.py +104 -0
natural_pdf/ocr/engine_easyocr.py +179 -0
natural_pdf/ocr/engine_paddle.py +204 -0
natural_pdf/ocr/engine_surya.py +171 -0
natural_pdf/ocr/ocr_manager.py +191 -0
natural_pdf/ocr/ocr_options.py +114 -0
natural_pdf/qa/__init__.py +3 -0
natural_pdf/qa/document_qa.py +396 -0
natural_pdf/selectors/__init__.py +4 -0
natural_pdf/selectors/parser.py +354 -0
natural_pdf/templates/__init__.py +1 -0
natural_pdf/templates/ocr_debug.html +517 -0
natural_pdf/utils/__init__.py +3 -0
natural_pdf/utils/highlighting.py +12 -0
natural_pdf/utils/reading_order.py +227 -0
natural_pdf/utils/visualization.py +223 -0
natural_pdf/widgets/__init__.py +4 -0
natural_pdf/widgets/frontend/viewer.js +88 -0
natural_pdf/widgets/viewer.py +765 -0
natural_pdf-0.1.0.dist-info/METADATA +295 -0
natural_pdf-0.1.0.dist-info/RECORD +52 -0
natural_pdf-0.1.0.dist-info/WHEEL +5 -0
natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
natural_pdf-0.1.0.dist-info/top_level.txt +1 -0

natural_pdf/analyzers/layout/layout_manager.py ADDED Viewed

@@ -0,0 +1,200 @@
+# layout_manager.py
+import logging
+from typing import Dict, List, Any, Optional, Union, Type
+from PIL import Image
+import copy
+# --- Import detector classes and options ---
+# Use try-except blocks for robustness if some detectors might be missing dependencies
+try:
+    from .base import LayoutDetector
+except ImportError:
+    LayoutDetector = type('LayoutDetector', (), {})
+try:
+    from .yolo import YOLODocLayoutDetector
+except ImportError:
+    YOLODocLayoutDetector = None
+try:
+    from .tatr import TableTransformerDetector
+except ImportError:
+    TableTransformerDetector = None
+try:
+    from .paddle import PaddleLayoutDetector
+except ImportError:
+    PaddleLayoutDetector = None
+try:
+    from .surya import SuryaLayoutDetector
+except ImportError:
+    SuryaLayoutDetector = None
+try:
+    from .docling import DoclingLayoutDetector
+except ImportError:
+    DoclingLayoutDetector = None
+from .layout_options import (
+    BaseLayoutOptions, YOLOLayoutOptions, TATRLayoutOptions,
+    PaddleLayoutOptions, SuryaLayoutOptions, DoclingLayoutOptions, LayoutOptions
+)
+logger = logging.getLogger(__name__)
+class LayoutManager:
+    """Manages layout detector selection, configuration, and execution.
+    Detector classes whose modules imported successfully are listed in
+    ENGINE_REGISTRY. Detector instances are created lazily and cached per
+    manager, so each engine class is instantiated at most once per manager.
+    """
+    # Registry mapping engine names to their detector class and options class
+    ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {}
+    # Populate registry only with detectors whose import succeeded (non-None)
+    if YOLODocLayoutDetector:
+        ENGINE_REGISTRY['yolo'] = {'class': YOLODocLayoutDetector, 'options_class': YOLOLayoutOptions}
+    if TableTransformerDetector:
+        ENGINE_REGISTRY['tatr'] = {'class': TableTransformerDetector, 'options_class': TATRLayoutOptions}
+    if PaddleLayoutDetector:
+        ENGINE_REGISTRY['paddle'] = {'class': PaddleLayoutDetector, 'options_class': PaddleLayoutOptions}
+    if SuryaLayoutDetector:
+        ENGINE_REGISTRY['surya'] = {'class': SuryaLayoutDetector, 'options_class': SuryaLayoutOptions}
+    if DoclingLayoutDetector:
+        ENGINE_REGISTRY['docling'] = {'class': DoclingLayoutDetector, 'options_class': DoclingLayoutOptions}
+    # Define the limited set of kwargs allowed for the simple analyze_layout call
+    SIMPLE_MODE_ALLOWED_KWARGS = {
+        'engine', 'confidence', 'classes', 'exclude_classes', 'device'
+    }
+    def __init__(self):
+        """Initializes the Layout Manager with an empty detector-instance cache."""
+        # Cache for detector instances (different from model cache inside detector)
+        self._detector_instances: Dict[str, LayoutDetector] = {}
+        logger.info(f"LayoutManager initialized. Available engines: {list(self.ENGINE_REGISTRY.keys())}")
+    def _get_engine_instance(self, engine_name: str) -> LayoutDetector:
+        """Retrieves or creates an instance of the specified layout detector.
+        Args:
+            engine_name: Registered engine name; matched case-insensitively.
+        Returns:
+            The cached detector instance for that engine.
+        Raises:
+            ValueError: If the name is not in ENGINE_REGISTRY.
+            RuntimeError: If the detector reports that it is not available.
+        """
+        engine_name = engine_name.lower()
+        if engine_name not in self.ENGINE_REGISTRY:
+            raise ValueError(f"Unknown layout engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}")
+        if engine_name not in self._detector_instances:
+            logger.info(f"Creating instance of layout engine: {engine_name}")
+            engine_class = self.ENGINE_REGISTRY[engine_name]['class']
+            detector_instance = engine_class() # Instantiate
+            if not detector_instance.is_available():
+                # Check availability before storing so the cache only holds usable detectors
+                raise RuntimeError(f"Layout engine '{engine_name}' is not available. Please check dependencies.")
+            self._detector_instances[engine_name] = detector_instance # Store if available
+        return self._detector_instances[engine_name]
+    def analyze_layout(
+        self,
+        image: Image.Image,
+        engine: Optional[str] = None, # Default engine handled below
+        options: Optional[LayoutOptions] = None,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
+        """
+        Analyzes layout of a single image using simple args or an options object.
+        Args:
+            image: The PIL Image to analyze.
+            engine: Name of the engine (e.g., 'yolo', 'tatr'). Defaults to the first
+                    available engine if None. Ignored when 'options' is an
+                    engine-specific options object; used when 'options' is a plain
+                    BaseLayoutOptions, which does not identify an engine by itself.
+            options: Specific LayoutOptions object for advanced configuration.
+                     A deep copy is handed to the detector.
+            **kwargs: For simple mode, accepts: 'confidence', 'classes',
+                      'exclude_classes', 'device'. Ignored (with a warning) when
+                      'options' is provided.
+        Returns:
+            A list of standardized detection dictionaries.
+        Raises:
+            TypeError: If 'image' is not a PIL Image, if 'options' matches no
+                       registered options class, or on unexpected simple-mode kwargs.
+            ValueError: If the selected engine name is not registered.
+            RuntimeError: If no engine is available or the selected engine's
+                          dependencies are missing.
+        """
+        final_options: BaseLayoutOptions
+        selected_engine_name: str
+        if not isinstance(image, Image.Image):
+            raise TypeError("Input 'image' must be a PIL Image.")
+        available_engines = self.get_available_engines()
+        if not available_engines:
+            raise RuntimeError("No layout engines are available. Please check dependencies.")
+        # Determine default engine if not specified
+        default_engine = engine if engine else available_engines[0]
+        # --- Determine Options and Engine ---
+        if options is not None:
+            # Advanced Mode
+            logger.debug(f"LayoutManager: Using advanced mode with options object: {type(options).__name__}")
+            final_options = copy.deepcopy(options) # Use copy
+            matched_engine = next(
+                (name for name, registry_entry in self.ENGINE_REGISTRY.items()
+                 if isinstance(options, registry_entry['options_class'])),
+                None
+            )
+            if matched_engine is not None:
+                selected_engine_name = matched_engine
+            elif type(options) is BaseLayoutOptions:
+                # Exact-type check on purpose: a plain BaseLayoutOptions carries no
+                # engine identity, so fall back to the requested/default engine.
+                selected_engine_name = default_engine.lower()
+                if selected_engine_name not in self.ENGINE_REGISTRY:
+                    raise ValueError(f"Unknown or unavailable layout engine: '{selected_engine_name}'. Available: {available_engines}")
+            else:
+                raise TypeError(f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options.")
+            if kwargs:
+                logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored.")
+        else:
+            # Simple Mode
+            selected_engine_name = default_engine.lower()
+            logger.debug(f"LayoutManager: Using simple mode with engine: '{selected_engine_name}' and kwargs: {kwargs}")
+            if selected_engine_name not in self.ENGINE_REGISTRY:
+                raise ValueError(f"Unknown or unavailable layout engine: '{selected_engine_name}'. Available: {available_engines}")
+            unexpected_kwargs = set(kwargs.keys()) - self.SIMPLE_MODE_ALLOWED_KWARGS
+            if unexpected_kwargs:
+                raise TypeError(f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration.")
+            options_class = self.ENGINE_REGISTRY[selected_engine_name]['options_class']
+            # Use BaseLayoutOptions defaults unless overridden by kwargs
+            base_defaults = BaseLayoutOptions()
+            simple_args = {
+                'confidence': kwargs.get('confidence', base_defaults.confidence),
+                'classes': kwargs.get('classes'),
+                'exclude_classes': kwargs.get('exclude_classes'),
+                'device': kwargs.get('device', base_defaults.device)
+            }
+            # Filter out None values so the options class applies its own defaults
+            simple_args_filtered = {k: v for k, v in simple_args.items() if v is not None}
+            final_options = options_class(**simple_args_filtered)
+            logger.debug(f"LayoutManager: Constructed options for simple mode: {final_options}")
+        # --- Get Engine Instance and Process ---
+        try:
+            engine_instance = self._get_engine_instance(selected_engine_name)
+            logger.info(f"Analyzing layout with engine '{selected_engine_name}'...")
+            # Call the engine's detect method
+            detections = engine_instance.detect(image, final_options)
+            logger.info(f"Layout analysis complete. Found {len(detections)} regions.")
+            return detections
+        except (ImportError, RuntimeError, ValueError, TypeError) as e:
+            logger.error(f"Layout analysis failed for engine '{selected_engine_name}': {e}", exc_info=True)
+            raise # Re-raise expected errors
+        except Exception as e:
+            logger.error(f"An unexpected error occurred during layout analysis: {e}", exc_info=True)
+            raise # Re-raise unexpected errors
+    def get_available_engines(self) -> List[str]:
+        """Returns a list of registered layout engine names that are currently available.
+        Detectors instantiated for the availability check are kept in the instance
+        cache, so repeated calls (analyze_layout calls this every time) do not
+        construct a new detector per engine on each call.
+        """
+        available = []
+        for name, registry_entry in self.ENGINE_REGISTRY.items():
+            if name in self._detector_instances:
+                # Cached instances passed is_available() before they were stored
+                available.append(name)
+                continue
+            try:
+                engine_class = registry_entry['class']
+                if callable(getattr(engine_class, 'is_available', None)):
+                    # is_available is an instance method, so an instance is needed
+                    detector_instance = engine_class()
+                    if detector_instance.is_available():
+                        self._detector_instances[name] = detector_instance
+                        available.append(name)
+                else:
+                    # Assume available if class exists (less robust)
+                    available.append(name)
+            except Exception as e:
+                # Best-effort probe: a failing engine is simply reported as unavailable
+                logger.debug(f"Layout engine '{name}' check failed: {e}")
+        return available

natural_pdf/analyzers/layout/layout_options.py ADDED Viewed

@@ -0,0 +1,78 @@
+# layout_options.py
+import logging
+from dataclasses import dataclass, field
+from typing import List, Optional, Dict, Any, Tuple, Union
+logger = logging.getLogger(__name__)
+# --- Base Layout Options ---
+@dataclass
+class BaseLayoutOptions:
+    """Base options for layout detection engines.
+    Holds the settings shared by all engines; the engine-specific options
+    classes in this module subclass it and add their own fields.
+    """
+    confidence: float = 0.5 # Minimum confidence threshold for detections
+    classes: Optional[List[str]] = None # Specific classes to detect (None for all)
+    exclude_classes: Optional[List[str]] = None # Classes to exclude
+    device: Optional[str] = 'cpu' # Preferred device ('cpu', 'cuda', 'mps', etc.)
+    # default_factory gives every instance its own dict instead of a shared one
+    extra_args: Dict[str, Any] = field(default_factory=dict) # For engine-specific args not yet fields
+# --- YOLO Specific Options ---
+@dataclass
+class YOLOLayoutOptions(BaseLayoutOptions):
+    """Options specific to YOLO-based layout detection.
+    Adds the model location and input size to the shared BaseLayoutOptions fields.
+    """
+    # presumably a Hugging Face repository id — confirm against the YOLO detector
+    model_repo: str = "juliozhao/DocLayout-YOLO-DocStructBench"
+    # Weights file name within model_repo
+    model_file: str = "doclayout_yolo_docstructbench_imgsz1024.pt"
+    image_size: int = 1024 # Input image size for the model
+# --- TATR Specific Options ---
+@dataclass
+class TATRLayoutOptions(BaseLayoutOptions):
+    """Options specific to Table Transformer (TATR) layout detection.
+    Adds the detection/structure model identifiers, resize limits and the
+    cell-creation flag to the shared BaseLayoutOptions fields.
+    """
+    # Which models to use (can be local paths or HF identifiers)
+    detection_model: str = "microsoft/table-transformer-detection"
+    structure_model: str = "microsoft/table-transformer-structure-recognition-v1.1-all"
+    # Input image resizing parameters
+    # NOTE(review): units are presumably pixels — confirm in the TATR detector
+    max_detection_size: int = 800
+    max_structure_size: int = 1000
+    # Whether to create cell regions (can be slow)
+    create_cells: bool = False # Keep the flag for cell creation control
+# --- Paddle Specific Options ---
+@dataclass
+class PaddleLayoutOptions(BaseLayoutOptions):
+    """Options specific to PaddlePaddle PP-Structure layout detection.
+    Adds language, angle-classification, table, logging and text-detection
+    switches to the shared BaseLayoutOptions fields.
+    """
+    lang: str = "en" # Language ('en', 'ch', etc.)
+    use_angle_cls: bool = False # Use text angle classification?
+    enable_table: bool = True # Enable table structure detection?
+    show_log: bool = False # Show Paddle internal logs?
+    # NOTE(review): the Paddle detector shown alongside this file only logs when
+    # this flag is set — confirm text detection is implemented before relying on it
+    detect_text: bool = True # Also detect raw text boxes using PaddleOCR?
+    verbose: bool = False # Verbose logging for the detector class
+# --- Surya Specific Options ---
+@dataclass
+class SuryaLayoutOptions(BaseLayoutOptions):
+    """Options specific to Surya layout detection.
+    Adds a model-name placeholder and a verbosity flag to the shared
+    BaseLayoutOptions fields.
+    """
+    # Surya doesn't seem to have many config options based on the example,
+    # but we can add placeholders if needed. Device is handled by BaseLayoutOptions.
+    model_name: str = "default" # Placeholder if different models become available
+    verbose: bool = False # Verbose logging for the detector class
+# --- Docling Specific Options ---
+@dataclass
+class DoclingLayoutOptions(BaseLayoutOptions):
+    """Options specific to Docling layout detection.
+    Declares only a verbosity flag of its own; other Docling settings are meant
+    to be supplied through the inherited extra_args dict.
+    """
+    # Pass kwargs directly to Docling's DocumentConverter via extra_args
+    # Common examples shown here for documentation, add others as needed to extra_args
+    # model_name: str = "ds4sd/SmolDocling-256M-preview" # Example model (pass via extra_args)
+    # prompt_text: Optional[str] = None # Optional prompt (pass via extra_args)
+    verbose: bool = False # Verbose logging for the detector class
+    # Other kwargs like 'device', 'batch_size' can go in extra_args
+# --- Union Type ---
+# Type alias covering every options class defined in this module, for use in
+# signatures that accept any engine's options object.
+LayoutOptions = Union[
+    YOLOLayoutOptions,
+    TATRLayoutOptions,
+    PaddleLayoutOptions,
+    SuryaLayoutOptions,
+    DoclingLayoutOptions,
+    BaseLayoutOptions # Include base for typing flexibility
+]

natural_pdf/analyzers/layout/paddle.py ADDED Viewed

@@ -0,0 +1,240 @@
+# paddle.py
+import logging
+import importlib.util
+import os
+import tempfile
+from typing import List, Dict, Any, Optional
+from PIL import Image
+# Assuming base class and options are importable
+try:
+    from .base import LayoutDetector
+    from .layout_options import PaddleLayoutOptions, BaseLayoutOptions
+except ImportError:
+    # Placeholders if run standalone or imports fail
+    class BaseLayoutOptions: pass
+    class PaddleLayoutOptions(BaseLayoutOptions): pass
+    class LayoutDetector:
+         def __init__(self): self.logger=logging.getLogger(); self.supported_classes=set()
+         def _get_model(self, options): raise NotImplementedError
+         def _normalize_class_name(self, n): return n
+         def validate_classes(self, c): pass
+    logging.basicConfig()
+logger = logging.getLogger(__name__)
+# Check for dependencies
+# find_spec() only locates the packages; the actual import happens below.
+paddle_spec = importlib.util.find_spec("paddle") or importlib.util.find_spec("paddlepaddle")
+paddleocr_spec = importlib.util.find_spec("paddleocr")
+# Both names stay None unless the import below succeeds; is_available() tests them.
+PPStructure = None
+PaddleOCR = None # For optional text detection
+if paddle_spec and paddleocr_spec:
+    try:
+        from paddleocr import PPStructure, PaddleOCR
+    except ImportError as e:
+        # Packages were located but could not be imported; leave the detector unavailable
+        logger.warning(f"Could not import Paddle dependencies: {e}")
+else:
+    logger.warning("paddlepaddle or paddleocr not found. PaddleLayoutDetector will not be available.")
+class PaddleLayoutDetector(LayoutDetector):
+    """Document layout and table structure detector using PaddlePaddle's PP-Structure.
+    The PPStructure instance is obtained through the base class's _get_model(),
+    which is expected to use _get_cache_key() and _load_model_from_options().
+    """
+    def __init__(self):
+        super().__init__()
+        # Supported classes by PP-Structure (adjust based on model version/capabilities)
+        self.supported_classes = {
+            'text', 'title', 'figure', 'figure_caption',
+            'table', 'table_caption', 'table_cell', # Added table_cell
+            'header', 'footer', 'reference', 'equation',
+            # PP-StructureV2 might add others like list, pub_number etc.
+        }
+        # Models are loaded via _get_model
+    def is_available(self) -> bool:
+        """Check if dependencies are installed (both PPStructure and PaddleOCR imported)."""
+        return PPStructure is not None and PaddleOCR is not None
+    def _get_cache_key(self, options: BaseLayoutOptions) -> str:
+        """Generate a cache key from every option that shapes the PPStructure instance.
+        Language, device, table analysis and angle classification are all passed
+        to the PPStructure constructor in _load_model_from_options, so each must
+        be part of the key; otherwise a model built with different flags could be
+        reused. show_log only affects logging and is deliberately left out.
+        NOTE(review): assumes the base class treats the key as an opaque string.
+        """
+        if not isinstance(options, PaddleLayoutOptions):
+            options = PaddleLayoutOptions(device=options.device) # Use base device
+        device_key = str(options.device).lower() if options.device else 'default_device'
+        lang_key = options.lang
+        table_key = int(bool(options.enable_table))
+        angle_key = int(bool(options.use_angle_cls))
+        return f"{self.__class__.__name__}_{device_key}_{lang_key}_table{table_key}_angle{angle_key}"
+    def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
+        """Load the PPStructure model based on options.
+        Raises:
+            RuntimeError: If the Paddle dependencies are not installed.
+            TypeError: If options is not a PaddleLayoutOptions.
+        """
+        if not self.is_available():
+            raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
+        if not isinstance(options, PaddleLayoutOptions):
+            raise TypeError("Incorrect options type provided for Paddle model loading.")
+        self.logger.info(f"Loading PPStructure model (lang={options.lang}, device={options.device}, table={options.enable_table})...")
+        try:
+            # The instance is called later with only the image path, so every
+            # behavior-controlling setting has to be supplied here at construction.
+            # layout=True keeps layout analysis on (layout=False would disable it).
+            # NOTE(review): ocr=False may also disable text detection used for
+            # table structure — check the PPStructure docs.
+            model_instance = PPStructure(
+                lang=options.lang,
+                use_gpu=('cuda' in str(options.device).lower() or 'gpu' in str(options.device).lower()),
+                use_angle_cls=options.use_angle_cls,
+                show_log=options.show_log,
+                layout=True, # Ensure layout analysis is on
+                table=options.enable_table, # Control table analysis
+                ocr=False # Usually disable internal OCR if only using for layout/table
+                # Add other PPStructure init args from options.extra_args if needed
+                # **options.extra_args
+            )
+            self.logger.info("PPStructure model loaded.")
+            return model_instance
+        except Exception as e:
+            self.logger.error(f"Failed to load PPStructure model: {e}", exc_info=True)
+            raise
+    def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
+        """Detect layout elements in an image using PaddlePaddle.
+        Args:
+            image: PIL image; it is converted to RGB and written to a temporary
+                   PNG because the PPStructure instance is called with a file path.
+            options: PaddleLayoutOptions; any other options object is converted,
+                     keeping its shared fields and using Paddle defaults otherwise.
+        Returns:
+            A list of detection dicts with 'bbox', 'class', 'confidence',
+            'normalized_class', 'source' and 'model' keys; table-cell entries
+            also carry 'text'.
+        Raises:
+            RuntimeError: If the Paddle dependencies are not installed.
+            Exception: Any error raised while saving the image or running
+                       PPStructure is logged and re-raised.
+        """
+        if not self.is_available():
+            raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
+        if not isinstance(options, PaddleLayoutOptions):
+            self.logger.warning("Received BaseLayoutOptions, expected PaddleLayoutOptions. Using defaults.")
+            options = PaddleLayoutOptions(
+                confidence=options.confidence, classes=options.classes,
+                exclude_classes=options.exclude_classes, device=options.device,
+                extra_args=options.extra_args
+                # Other Paddle options will use defaults
+            )
+        self.validate_classes(options.classes or [])
+        if options.exclude_classes:
+            self.validate_classes(options.exclude_classes)
+        # Get the cached/loaded PPStructure instance
+        ppstructure_instance = self._get_model(options)
+        detections = []
+        # The temporary directory (and the image inside it) is removed when the
+        # with-block exits, on success and on error alike.
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_image_path = os.path.join(temp_dir, f"paddle_input_{os.getpid()}.png")
+            try:
+                self.logger.debug(f"Saving temporary image for Paddle detector to: {temp_image_path}")
+                image.convert("RGB").save(temp_image_path) # Ensure RGB
+                # The instance was configured during _load_model_from_options
+                self.logger.debug("Running PPStructure analysis...")
+                result = ppstructure_instance(temp_image_path)
+            except Exception as e:
+                self.logger.error(f"Error during PPStructure analysis: {e}", exc_info=True)
+                raise # Re-raise error
+        # --- Process Results ---
+        # Check for an empty/None result before calling len() on it
+        if not result:
+            self.logger.warning("PaddleLayout returned empty results")
+            return []
+        self.logger.debug(f"PPStructure returned {len(result)} regions.")
+        # Prepare normalized class filters once
+        normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
+        normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
+        for region in result:
+            try:
+                region_type_orig = region.get('type', 'unknown')
+                # Handle potential list returns for type (seen in some versions)
+                if isinstance(region_type_orig, list):
+                    region_type_orig = region_type_orig[0] if region_type_orig else 'unknown'
+                region_type = region_type_orig.lower()
+                normalized_class = self._normalize_class_name(region_type)
+                # Apply class filtering
+                if normalized_classes_req and normalized_class not in normalized_classes_req:
+                    continue
+                if normalized_class in normalized_classes_excl:
+                    continue
+                # PP-Structure results don't always have confidence; treat a
+                # missing or None score as 1.0 so the region is not dropped
+                confidence_score = region.get('score')
+                if confidence_score is None:
+                    confidence_score = 1.0
+                if confidence_score < options.confidence:
+                    continue
+                bbox = region.get('bbox')
+                if not bbox or len(bbox) != 4:
+                    self.logger.warning(f"Skipping region with invalid bbox: {region}")
+                    continue
+                x_min, y_min, x_max, y_max = map(float, bbox)
+                # Add detection
+                detection_data = {
+                    'bbox': (x_min, y_min, x_max, y_max),
+                    'class': region_type_orig, # Keep original case if needed
+                    'confidence': confidence_score,
+                    'normalized_class': normalized_class,
+                    'source': 'layout',
+                    'model': 'paddle'
+                }
+                detections.append(detection_data)
+                # --- Process Table Cells (if enabled and present) ---
+                if region_type == 'table' and options.enable_table and 'res' in region:
+                    process_cells = (normalized_classes_req is None or 'table-cell' in normalized_classes_req) and \
+                                    ('table-cell' not in normalized_classes_excl)
+                    # Only the list form of 'res' is handled. An older dict form
+                    # with a 'cells' list is not processed and yields no cells.
+                    if process_cells and isinstance(region['res'], list):
+                        for cell in region['res']:
+                            if 'box' not in cell or len(cell['box']) != 4:
+                                continue
+                            cell_x_min, cell_y_min, cell_x_max, cell_y_max = map(float, cell['box'])
+                            # Add cell detection (confidence often not available per cell)
+                            detections.append({
+                                'bbox': (cell_x_min, cell_y_min, cell_x_max, cell_y_max),
+                                'class': 'table cell', # Standardize name
+                                'confidence': confidence_score * 0.95, # Inherit table confidence (slightly reduced)
+                                'normalized_class': 'table-cell',
+                                'text': cell.get('text', ''), # Include text if available
+                                'source': 'layout', 'model': 'paddle'
+                            })
+            except (TypeError, KeyError, IndexError, ValueError, AttributeError) as e:
+                # AttributeError covers non-dict regions and non-string type values,
+                # so one malformed region is skipped instead of aborting detection
+                self.logger.warning(f"Error processing Paddle region: {region}. Error: {e}")
+                continue
+        # --- Optional: Add Text Boxes from separate OCR run ---
+        if options.detect_text:
+            # Not implemented: this would need a separate PaddleOCR instance run
+            # with det=True, rec=False, adding 'text' class detections. Say so in
+            # the log rather than claiming that OCR is running.
+            self.logger.info("Paddle detect_text=True: separate OCR text detection is not implemented; skipping.")
+        self.logger.info(f"PaddleLayout detected {len(detections)} layout elements matching criteria.")
+        return detections

natural_pdf/analyzers/layout/surya.py ADDED Viewed

@@ -0,0 +1,151 @@
+# surya.py
+import logging
+import importlib.util
+import os
+import tempfile
+from typing import List, Dict, Any, Optional, Tuple
+from PIL import Image
+from .base import LayoutDetector
+from .layout_options import SuryaLayoutOptions, BaseLayoutOptions
+logger = logging.getLogger(__name__)
+# Check for dependency
+# find_spec() only locates the package; the actual import happens below.
+surya_spec = importlib.util.find_spec("surya")
+# Stays None unless the import below succeeds; is_available() tests it.
+LayoutPredictor = None
+if surya_spec:
+    try:
+        from surya.layout import LayoutPredictor
+    except ImportError as e:
+        # Package was located but could not be imported; leave the detector unavailable
+        logger.warning(f"Could not import Surya dependencies: {e}")
+else:
+    logger.warning("surya not found. SuryaLayoutDetector will not be available.")
+class SuryaLayoutDetector(LayoutDetector):
+    """Layout detector backed by Surya's LayoutPredictor."""
+    def __init__(self):
+        super().__init__()
+        # Class names this detector declares as supported
+        self.supported_classes = {
+            'title', 'heading', 'sectionheader', 'pageheader', 'pagefooter',
+            'text', 'textinlinemath', 'mathformula', 'code',
+            'list', 'listitem', 'table', 'tableofcontents',
+            'picture', 'caption', 'form'
+        }
+        # The predictor itself is obtained (and cached) through _get_model
+    def is_available(self) -> bool:
+        """Report whether the Surya LayoutPredictor could be imported."""
+        return LayoutPredictor is not None
+    def _get_cache_key(self, options: BaseLayoutOptions) -> str:
+        """Build the cache key from the detector class name, device and model name."""
+        surya_options = options if isinstance(options, SuryaLayoutOptions) else SuryaLayoutOptions(device=options.device)
+        if surya_options.device:
+            device_part = str(surya_options.device).lower()
+        else:
+            device_part = 'default_device'
+        return f"{self.__class__.__name__}_{device_part}_{surya_options.model_name}"
+    def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
+        """Construct a LayoutPredictor from the device and extra_args in options.
+        Raises RuntimeError when Surya is missing and TypeError when options is
+        not a SuryaLayoutOptions; construction errors are logged and re-raised.
+        """
+        if not self.is_available():
+            raise RuntimeError("Surya dependency (surya-ocr) not installed.")
+        if not isinstance(options, SuryaLayoutOptions):
+            raise TypeError("Incorrect options type provided for Surya model loading.")
+        self.logger.info(f"Loading Surya LayoutPredictor (device={options.device})...")
+        try:
+            init_kwargs = {}
+            if options.device:
+                init_kwargs['device'] = options.device
+            # extra_args are applied last, so they can override the device entry
+            init_kwargs.update(options.extra_args)
+            loaded_predictor = LayoutPredictor(**init_kwargs)
+            self.logger.info("Surya LayoutPredictor loaded.")
+            return loaded_predictor
+        except Exception as e:
+            self.logger.error(f"Failed to load Surya LayoutPredictor: {e}", exc_info=True)
+            raise
+    def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
+        """Run Surya layout prediction on one image and return filtered detections.
+        Options that are not SuryaLayoutOptions are converted, keeping the shared
+        fields. Boxes below the confidence threshold, outside the requested
+        classes, or in the excluded classes are dropped. Errors raised during
+        prediction or result processing are logged and re-raised.
+        """
+        if not self.is_available():
+            raise RuntimeError("Surya dependency (surya-ocr) not installed.")
+        if not isinstance(options, SuryaLayoutOptions):
+            self.logger.warning("Received BaseLayoutOptions, expected SuryaLayoutOptions. Using defaults.")
+            shared_fields = ('confidence', 'classes', 'exclude_classes', 'device', 'extra_args')
+            options = SuryaLayoutOptions(**{name: getattr(options, name) for name in shared_fields})
+        self.validate_classes(options.classes or [])
+        if options.exclude_classes:
+            self.validate_classes(options.exclude_classes)
+        predictor = self._get_model(options)
+        # The predictor works on a batch, so wrap the single RGB image in a list
+        batch = [image.convert("RGB")]
+        detections = []
+        try:
+            self.logger.debug("Running Surya layout prediction...")
+            layout_predictions = predictor(batch)
+            self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
+            if not layout_predictions:
+                self.logger.warning("Surya returned empty predictions list.")
+                return []
+            # One image went in, so only the first result is relevant
+            page_result = layout_predictions[0]
+            # Normalize the class filters once, outside the loop
+            wanted = None
+            if options.classes:
+                wanted = {self._normalize_class_name(c) for c in options.classes}
+            unwanted = set()
+            if options.exclude_classes:
+                unwanted = {self._normalize_class_name(c) for c in options.exclude_classes}
+            for box in page_result.bboxes:
+                label = box.label
+                normalized_label = self._normalize_class_name(label)
+                score = float(box.confidence)
+                # Skip boxes under the threshold or rejected by the class filters
+                if score < options.confidence:
+                    continue
+                if (wanted and normalized_label not in wanted) or normalized_label in unwanted:
+                    continue
+                # Surya provides [x_min, y_min, x_max, y_max]
+                left, top, right, bottom = (float(value) for value in box.bbox)
+                detections.append({
+                    'bbox': (left, top, right, bottom),
+                    'class': label,
+                    'confidence': score,
+                    'normalized_class': normalized_label,
+                    'source': 'layout',
+                    'model': 'surya'
+                })
+            self.logger.info(f"Surya detected {len(detections)} layout elements matching criteria.")
+        except Exception as e:
+            self.logger.error(f"Error during Surya layout detection: {e}", exc_info=True)
+            raise
+        return detections