PyPI - natural-pdf - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (141)

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +209 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +288 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +413 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +512 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +604 -0
docs/tutorials/12-ocr-integration.md +175 -0
docs/tutorials/13-semantic-search.ipynb +1328 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +50 -33
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/gemini.py +264 -0
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +125 -58
natural_pdf/analyzers/layout/layout_options.py +43 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +89 -45
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +146 -97
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +419 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +1044 -521
natural_pdf/core/pdf.py +516 -313
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +307 -225
natural_pdf/elements/collections.py +805 -543
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +889 -879
natural_pdf/elements/text.py +127 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +57 -35
natural_pdf/ocr/engine.py +150 -46
natural_pdf/ocr/engine_easyocr.py +146 -150
natural_pdf/ocr/engine_paddle.py +118 -175
natural_pdf/ocr/engine_surya.py +78 -141
natural_pdf/ocr/ocr_factory.py +114 -0
natural_pdf/ocr/ocr_manager.py +122 -124
natural_pdf/ocr/ocr_options.py +16 -20
natural_pdf/ocr/utils.py +98 -0
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/templates/spa/css/style.css +334 -0
natural_pdf/templates/spa/index.html +31 -0
natural_pdf/templates/spa/js/app.js +472 -0
natural_pdf/templates/spa/words.txt +235976 -0
natural_pdf/utils/debug.py +32 -0
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/identifiers.py +29 -0
natural_pdf/utils/packaging.py +418 -0
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
natural_pdf-0.1.6.dist-info/RECORD +141 -0
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
natural_pdf/templates/ocr_debug.html +0 -517
natural_pdf-0.1.4.dist-info/RECORD +0 -61
natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0

natural_pdf/ocr/engine.py CHANGED Viewed

@@ -1,7 +1,8 @@
 # ocr_engine_base.py
 import logging
 from abc import ABC, abstractmethod
-from typing import Dict, List, Any, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 from PIL import Image
 # Assuming ocr_options defines BaseOCROptions
@@ -9,35 +10,138 @@ from .ocr_options import BaseOCROptions
 logger = logging.getLogger(__name__)
+class TextRegion:
+    """Standard representation of an OCR text region."""
+    def __init__(self, bbox: Tuple[float, float, float, float], text: str, confidence: float, source: str = "ocr"):
+        """
+        Initialize a text region.
+        Args:
+            bbox: Tuple of (x0, y0, x1, y1) coordinates
+            text: The recognized text
+            confidence: Confidence score (0.0-1.0)
+            source: Source of the text region (default: "ocr")
+        """
+        self.bbox = bbox
+        self.text = text
+        self.confidence = confidence
+        self.source = source
+    @classmethod
+    def from_polygon(cls, polygon: List[List[float]], text: str, confidence: float):
+        """Create from polygon coordinates [[x1,y1], [x2,y2], ...]"""
+        x_coords = [float(point[0]) for point in polygon]
+        y_coords = [float(point[1]) for point in polygon]
+        bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
+        return cls(bbox, text, confidence)
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary representation for compatibility."""
+        return {
+            "bbox": self.bbox,
+            "text": self.text,
+            "confidence": self.confidence,
+            "source": self.source
+        }
 class OCREngine(ABC):
     """Abstract Base Class for OCR engines."""
+    # Default values as class constants
+    DEFAULT_MIN_CONFIDENCE = 0.2
+    DEFAULT_LANGUAGES = ['en']
+    DEFAULT_DEVICE = 'cpu'
     def __init__(self):
         """Initializes the base OCR engine."""
         self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
         self.logger.info(f"Initializing {self.__class__.__name__}")
-        self._reader_cache = {} # Cache for initialized models/readers
+        self._model = None
+        self._initialized = False
+        self._reader_cache = {}  # Cache for initialized models/readers
-    @abstractmethod
     def process_image(
         self,
-        images: Union[Image.Image, List[Image.Image]], # Accept single or list
-        options: BaseOCROptions
-    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
+        images: Union[Image.Image, List[Image.Image]],
+        languages: Optional[List[str]] = None,
+        min_confidence: Optional[float] = None,
+        device: Optional[str] = None,
+        detect_only: bool = False,
+        options: Optional[BaseOCROptions] = None,
+    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
         """
-        Processes a single image or a batch of images using the specific engine and options.
+        Process a single image or batch of images with OCR.
         Args:
-            images: A single PIL Image or a list of PIL Images.
-            options: An instance of a dataclass inheriting from BaseOCROptions
-                     containing configuration for this run.
+            images: A single PIL Image or a list of PIL Images
+            languages: List of languages to use (default: ['en'])
+            min_confidence: Minimum confidence threshold (default: 0.2)
+            device: Device to use for processing (default: 'cpu')
+            detect_only: Whether to only detect text regions without recognition
+            options: Engine-specific options
         Returns:
-            If input is a single image: List of result dictionaries.
-            If input is a list of images: List of lists of result dictionaries,
-                                          corresponding to each input image.
-                                          An empty list indicates failure for that image.
+            For a single image: List of text region dictionaries
+            For a batch: List of lists of text region dictionaries
         """
+        # Convert single image to batch format
+        single_image = not isinstance(images, list)
+        image_batch = [images] if single_image else images
+        # Use default values where parameters are not provided
+        effective_languages = languages or self.DEFAULT_LANGUAGES
+        effective_confidence = min_confidence if min_confidence is not None else self.DEFAULT_MIN_CONFIDENCE
+        effective_device = device or self.DEFAULT_DEVICE
+        # Ensure the model is initialized
+        self._ensure_initialized(effective_languages, effective_device, options)
+        # Process each image in the batch
+        results = []
+        for img in image_batch:
+            # Preprocess the image for the specific engine
+            processed_img = self._preprocess_image(img)
+            # Process the image with the engine-specific implementation
+            raw_results = self._process_single_image(processed_img, detect_only, options)
+            # Convert results to standardized format
+            text_regions = self._standardize_results(raw_results, effective_confidence, detect_only)
+            # Convert TextRegion objects to dictionaries for backward compatibility
+            region_dicts = [region.to_dict() for region in text_regions]
+            results.append(region_dicts)
+        # Return results in the appropriate format
+        return results[0] if single_image else results
+    def _ensure_initialized(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
+        """Ensure the model is initialized with the correct parameters."""
+        if not self._initialized:
+            self._initialize_model(languages, device, options)
+            self._initialized = True
+    @abstractmethod
+    def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
+        """Initialize the OCR model with the given parameters."""
+        raise NotImplementedError("Subclasses must implement this method")
+    @abstractmethod
+    def _preprocess_image(self, image: Image.Image) -> Any:
+        """Convert PIL Image to engine-specific format."""
+        raise NotImplementedError("Subclasses must implement this method")
+    @abstractmethod
+    def _process_single_image(self, image: Any, detect_only: bool, options: Optional[BaseOCROptions]) -> Any:
+        """Process a single image with the initialized model."""
+        raise NotImplementedError("Subclasses must implement this method")
+    @abstractmethod
+    def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
+        """Convert engine-specific results to standardized TextRegion objects."""
         raise NotImplementedError("Subclasses must implement this method")
     @abstractmethod
@@ -61,44 +165,44 @@ class OCREngine(ABC):
         Returns:
             A string cache key.
         """
-        # Basic key includes languages and device
-        lang_key = "-".join(sorted(options.languages))
-        device_key = str(options.device).lower()
+        lang_key = "-".join(sorted(getattr(options, "languages", self.DEFAULT_LANGUAGES)))
+        device_key = str(getattr(options, "device", self.DEFAULT_DEVICE)).lower()
         return f"{self.__class__.__name__}_{lang_key}_{device_key}"
-    def _standardize_bbox(self, bbox: Any) -> Optional[Tuple[float, float, float, float]]:
-        """
-        Helper to standardize bounding boxes to (x0, y0, x1, y1) format.
-        Args:
-            bbox: The bounding box in the engine's native format.
-                  Expected formats:
-                  - List/Tuple of 4 numbers: (x0, y0, x1, y1)
-                  - List of points: [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (polygon)
-        Returns:
-            Tuple[float, float, float, float] or None if conversion fails.
-        """
-        try:
-            if isinstance(bbox, (list, tuple)) and len(bbox) == 4 and all(isinstance(n, (int, float)) for n in bbox):
-                # Already in (x0, y0, x1, y1) format (or similar)
+    def _standardize_bbox(self, bbox: Any) -> Tuple[float, float, float, float]:
+        """Standardizes bounding boxes to (x0, y0, x1, y1) format. Raises ValueError if standardization fails."""
+        # Check if it's already in the correct tuple/list format
+        if (
+            isinstance(bbox, (list, tuple))
+            and len(bbox) == 4
+            and all(isinstance(n, (int, float)) for n in bbox)
+        ):
+            try:
                 return tuple(float(c) for c in bbox[:4])
-            elif isinstance(bbox, (list, tuple)) and len(bbox) > 0 and isinstance(bbox[0], (list, tuple)):
-                # Polygon format [[x1,y1],[x2,y2],...]
+            except (ValueError, TypeError) as e:
+                raise ValueError(f"Invalid number format in bbox: {bbox}") from e
+        # Check if it's in polygon format [[x1,y1],[x2,y2],...]
+        elif (
+            isinstance(bbox, (list, tuple))
+            and len(bbox) > 0
+            and isinstance(bbox[0], (list, tuple))
+            and len(bbox[0]) == 2 # Ensure points are pairs
+        ):
+            try:
                 x_coords = [float(point[0]) for point in bbox]
                 y_coords = [float(point[1]) for point in bbox]
-                x0 = min(x_coords)
-                y0 = min(y_coords)
-                x1 = max(x_coords)
-                y1 = max(y_coords)
-                return (x0, y0, x1, y1)
-        except Exception as e:
-            self.logger.warning(f"Could not standardize bounding box: {bbox}. Error: {e}")
-        return None
+                if not x_coords or not y_coords: # Handle empty polygon case
+                    raise ValueError("Empty polygon provided")
+                return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
+            except (ValueError, TypeError, IndexError) as e:
+                raise ValueError(f"Invalid polygon format or values: {bbox}") from e
+        # If it's neither format, raise an error
+        raise ValueError(f"Could not standardize bounding box from unexpected format: {bbox}")
     def __del__(self):
         """Cleanup resources when the engine is deleted."""
         self.logger.info(f"Cleaning up {self.__class__.__name__} resources.")
         # Clear reader cache to free up memory/GPU resources
         self._reader_cache.clear()

natural_pdf/ocr/engine_easyocr.py CHANGED Viewed

@@ -1,179 +1,175 @@
 # ocr_engine_easyocr.py
-import logging
 import importlib.util
-from typing import Dict, List, Any, Optional, Tuple, Union
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 from PIL import Image
-import inspect # Used for dynamic parameter passing
-from .engine import OCREngine
-from .ocr_options import EasyOCROptions, BaseOCROptions
+from .engine import OCREngine, TextRegion
+from .ocr_options import BaseOCROptions, EasyOCROptions
 logger = logging.getLogger(__name__)
 class EasyOCREngine(OCREngine):
     """EasyOCR engine implementation."""
     def __init__(self):
         super().__init__()
-        self._easyocr = None # Lazy load easyocr module
-    def _lazy_import_easyocr(self):
-        """Imports easyocr only when needed."""
-        if self._easyocr is None:
-            if not self.is_available():
-                raise ImportError("EasyOCR is not installed or available.")
-            try:
-                import easyocr
-                self._easyocr = easyocr
-                logger.info("EasyOCR module imported successfully.")
-            except ImportError as e:
-                logger.error(f"Failed to import EasyOCR: {e}")
-                raise
-        return self._easyocr
+        # No longer need _easyocr attribute
+        # self._easyocr = None
     def is_available(self) -> bool:
         """Check if EasyOCR is installed."""
         return importlib.util.find_spec("easyocr") is not None
-    def _get_cache_key(self, options: EasyOCROptions) -> str:
-        """Generate a more specific cache key for EasyOCR."""
-        base_key = super()._get_cache_key(options)
-        recog_key = options.recog_network
-        detect_key = options.detect_network
-        quantize_key = str(options.quantize)
-        return f"{base_key}_{recog_key}_{detect_key}_{quantize_key}"
-    def _get_reader(self, options: EasyOCROptions):
-        """Get or initialize an EasyOCR reader based on options."""
-        cache_key = self._get_cache_key(options)
-        if cache_key in self._reader_cache:
-            logger.debug(f"Using cached EasyOCR reader for key: {cache_key}")
-            return self._reader_cache[cache_key]
-        logger.info(f"Creating new EasyOCR reader for key: {cache_key}")
-        easyocr = self._lazy_import_easyocr()
-        constructor_sig = inspect.signature(easyocr.Reader.__init__)
-        constructor_args = {}
-        constructor_args['lang_list'] = options.languages
-        constructor_args['gpu'] = 'cuda' in str(options.device).lower() or 'mps' in str(options.device).lower()
-        for field_name, param in constructor_sig.parameters.items():
-            if field_name in ['self', 'lang_list', 'gpu']: continue
-            if hasattr(options, field_name):
-                 constructor_args[field_name] = getattr(options, field_name)
-            elif field_name in options.extra_args:
-                 constructor_args[field_name] = options.extra_args[field_name]
-        logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
+    def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
+        """Initialize the EasyOCR model."""
+        # Import directly here
         try:
-            reader = easyocr.Reader(**constructor_args)
-            self._reader_cache[cache_key] = reader
-            logger.info("EasyOCR reader created successfully.")
-            return reader
+            import easyocr
+            self.logger.info("EasyOCR module imported successfully.")
+        except ImportError as e:
+            self.logger.error(f"Failed to import EasyOCR: {e}")
+            raise
+        # Cast to EasyOCROptions if possible, otherwise use default
+        easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
+        # Prepare constructor arguments
+        use_gpu = "cuda" in device.lower() or "mps" in device.lower()
+        constructor_args = {
+            "lang_list": languages,
+            "gpu": use_gpu,
+            # Explicitly map relevant options
+            "model_storage_directory": easy_options.model_storage_directory,
+            "user_network_directory": easy_options.user_network_directory,
+            "recog_network": easy_options.recog_network,
+            "detect_network": easy_options.detect_network,
+            "download_enabled": easy_options.download_enabled,
+            "detector": easy_options.detector,
+            "recognizer": easy_options.recognizer,
+            "verbose": easy_options.verbose,
+            "quantize": easy_options.quantize,
+            "cudnn_benchmark": easy_options.cudnn_benchmark,
+        }
+        # Filter out None values, as EasyOCR expects non-None or default behaviour
+        constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
+        self.logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
+        # Create the reader
+        try:
+            self._model = easyocr.Reader(**constructor_args)
+            self.logger.info("EasyOCR reader created successfully")
         except Exception as e:
-            logger.error(f"Failed to create EasyOCR reader: {e}", exc_info=True)
+            self.logger.error(f"Failed to create EasyOCR reader: {e}")
             raise
-    def _prepare_readtext_args(self, options: EasyOCROptions, reader) -> Dict[str, Any]:
-        """Helper to prepare arguments for the readtext method."""
-        readtext_sig = inspect.signature(reader.readtext)
+    def _preprocess_image(self, image: Image.Image) -> np.ndarray:
+        """Convert PIL Image to numpy array for EasyOCR."""
+        return np.array(image)
+    def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]) -> Any:
+        """Process a single image with EasyOCR."""
+        if self._model is None:
+            raise RuntimeError("EasyOCR model not initialized")
+        # Cast options to proper type if provided
+        easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
+        # Prepare readtext arguments (only needed if not detect_only)
         readtext_args = {}
-        for field_name, param in readtext_sig.parameters.items():
-             if field_name == 'image': continue
-             if hasattr(options, field_name):
-                 readtext_args[field_name] = getattr(options, field_name)
-             elif field_name in options.extra_args:
-                 readtext_args[field_name] = options.extra_args[field_name]
-        logger.debug(f"EasyOCR readtext args: {readtext_args}")
-        return readtext_args
-    def _standardize_results(self, raw_results: List[Any], options: EasyOCROptions) -> List[Dict[str, Any]]:
-        """Standardizes raw results from EasyOCR's readtext."""
-        standardized_results = []
-        min_confidence = options.min_confidence
+        if not detect_only:
+            for param in [
+                "detail", "paragraph", "min_size", "contrast_ths", "adjust_contrast",
+                "filter_ths", "text_threshold", "low_text", "link_threshold",
+                "canvas_size", "mag_ratio", "slope_ths", "ycenter_ths", "height_ths",
+                "width_ths", "y_ths", "x_ths", "add_margin", "output_format"
+            ]:
+                if hasattr(easy_options, param):
+                    val = getattr(easy_options, param)
+                    if val is not None:
+                        readtext_args[param] = val
+        # Process differently based on detect_only flag
+        if detect_only:
+            # Returns tuple (horizontal_list, free_list)
+            # horizontal_list is a list containing one item: the list of boxes
+            # Each box is [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
+            bboxes_tuple = self._model.detect(image, **readtext_args) # Pass args here too? Check EasyOCR docs if needed.
+            if bboxes_tuple and isinstance(bboxes_tuple, tuple) and len(bboxes_tuple) > 0 and isinstance(bboxes_tuple[0], list):
+                return bboxes_tuple[0] # Return the list of polygons directly
+            else:
+                self.logger.warning(f"EasyOCR detect returned unexpected format: {bboxes_tuple}")
+                return [] # Return empty list on unexpected format
+        else:
+            return self._model.readtext(image, **readtext_args)
+    def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
+        """Convert EasyOCR results to standardized TextRegion objects."""
+        standardized_regions = []
+        if detect_only:
+            # In detect_only mode, raw_results is already a list of bounding boxes
+            # Each bbox is in [x_min, x_max, y_min, y_max] format
+            if isinstance(raw_results, list):
+                for detection in raw_results:
+                    try:
+                        if isinstance(detection, (list, tuple)) and len(detection) == 4:
+                             x_min, x_max, y_min, y_max = detection
+                             # Convert to standardized (x0, y0, x1, y1) format
+                             try:
+                                 bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
+                                 standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
+                             except (ValueError, TypeError) as e:
+                                 raise ValueError(f"Invalid number format in EasyOCR detect bbox: {detection}") from e
+                        else:
+                            raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
+                    except ValueError as e:
+                        # Re-raise any value errors from standardization or format checks
+                        raise e
+                    except Exception as e:
+                        # Catch other potential processing errors
+                        raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
+            else:
+                raise ValueError(f"Expected list of bounding boxes in detect_only mode, got: {raw_results}")
+            return standardized_regions
+        # Full OCR mode (readtext results)
         for detection in raw_results:
             try:
-                if options.detail == 1 and isinstance(detection, (list, tuple)) and len(detection) >= 3:
-                    bbox_raw = detection[0]
+                # Detail mode (list/tuple result)
+                if isinstance(detection, (list, tuple)) and len(detection) >= 3:
+                    bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
                     text = str(detection[1])
                     confidence = float(detection[2])
                     if confidence >= min_confidence:
-                        bbox = self._standardize_bbox(bbox_raw)
-                        if bbox:
-                            standardized_results.append({
-                                'bbox': bbox, 'text': text, 'confidence': confidence, 'source': 'ocr'
-                            })
-                        else:
-                             logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
-                elif options.detail == 0 and isinstance(detection, str):
-                     standardized_results.append({
-                         'bbox': None, 'text': detection, 'confidence': 1.0, 'source': 'ocr'
-                     })
-            except (IndexError, ValueError, TypeError) as e:
-                 logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
-                 continue
-        return standardized_results
-    def process_image(
-        self,
-        images: Union[Image.Image, List[Image.Image]],
-        options: BaseOCROptions
-    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
-        """Processes a single image or a batch of images with EasyOCR."""
-        if not isinstance(options, EasyOCROptions):
-             logger.warning("Received BaseOCROptions, expected EasyOCROptions. Using defaults.")
-             # Create default EasyOCR options if base was passed, preserving base settings
-             options = EasyOCROptions(
-                 languages=options.languages,
-                 min_confidence=options.min_confidence,
-                 device=options.device,
-                 extra_args=options.extra_args # Pass along any extra args
-             )
-        reader = self._get_reader(options)
-        readtext_args = self._prepare_readtext_args(options, reader)
-        # --- Handle single image or batch ---
-        if isinstance(images, list):
-            # --- Batch Processing (Iterative for EasyOCR) ---
-            all_results = []
-            logger.info(f"Processing batch of {len(images)} images with EasyOCR (iteratively)...")
-            for i, img in enumerate(images):
-                if not isinstance(img, Image.Image):
-                     logger.warning(f"Item at index {i} in batch is not a PIL Image. Skipping.")
-                     all_results.append([])
-                     continue
-                img_array = np.array(img)
-                try:
-                    logger.debug(f"Processing image {i+1}/{len(images)} in batch.")
-                    raw_results = reader.readtext(img_array, **readtext_args)
-                    standardized = self._standardize_results(raw_results, options)
-                    all_results.append(standardized)
-                except Exception as e:
-                    logger.error(f"Error processing image {i+1} in EasyOCR batch: {e}", exc_info=True)
-                    all_results.append([]) # Append empty list for failed image
-            logger.info(f"Finished processing batch with EasyOCR.")
-            return all_results # Return List[List[Dict]]
-        elif isinstance(images, Image.Image):
-            # --- Single Image Processing ---
-            logger.info("Processing single image with EasyOCR...")
-            img_array = np.array(images)
-            try:
-                raw_results = reader.readtext(img_array, **readtext_args)
-                standardized = self._standardize_results(raw_results, options)
-                logger.info(f"Finished processing single image. Found {len(standardized)} results.")
-                return standardized # Return List[Dict]
+                        try:
+                            # Use the standard helper for polygons
+                            bbox = self._standardize_bbox(bbox_raw)
+                            standardized_regions.append(TextRegion(bbox, text, confidence))
+                        except ValueError as e:
+                            raise ValueError(f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}") from e
+                # Simple mode (string result)
+                elif isinstance(detection, str):
+                    if 0.0 >= min_confidence:  # Always include if min_confidence is 0
+                        standardized_regions.append(TextRegion((0, 0, 0, 0), detection, 1.0))
+                else:
+                    # Handle unexpected format in OCR mode
+                    raise ValueError(f"Invalid OCR detection format from EasyOCR readtext: {detection}")
+            except ValueError as e:
+                # Re-raise any value errors from standardization or format checks
+                raise e
             except Exception as e:
-                logger.error(f"Error processing single image with EasyOCR: {e}", exc_info=True)
-                return [] # Return empty list on failure
-        else:
-            raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
+                # Catch other potential processing errors
+                raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
+        return standardized_regions

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl