PyPI - natural-pdf - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl - Mend

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

docs/finetuning/index.md +176 -0
docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
natural_pdf/__init__.py +1 -0
natural_pdf/analyzers/layout/gemini.py +63 -47
natural_pdf/collections/pdf_collection.py +5 -2
natural_pdf/core/element_manager.py +6 -4
natural_pdf/core/page.py +36 -27
natural_pdf/core/pdf.py +25 -16
natural_pdf/elements/base.py +1 -3
natural_pdf/elements/collections.py +13 -14
natural_pdf/elements/region.py +7 -6
natural_pdf/exporters/__init__.py +4 -0
natural_pdf/exporters/base.py +61 -0
natural_pdf/exporters/paddleocr.py +345 -0
natural_pdf/ocr/__init__.py +16 -8
natural_pdf/ocr/engine.py +46 -30
natural_pdf/ocr/engine_easyocr.py +81 -40
natural_pdf/ocr/engine_paddle.py +39 -28
natural_pdf/ocr/engine_surya.py +32 -16
natural_pdf/ocr/ocr_factory.py +34 -23
natural_pdf/ocr/ocr_manager.py +15 -11
natural_pdf/ocr/ocr_options.py +5 -0
natural_pdf/ocr/utils.py +46 -31
natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
natural_pdf/utils/debug.py +4 -2
natural_pdf/utils/identifiers.py +9 -5
natural_pdf/utils/packaging.py +172 -105
natural_pdf/utils/text_extraction.py +44 -64
natural_pdf/utils/visualization.py +1 -1
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0

natural_pdf/ocr/engine.py CHANGED Viewed

@@ -13,11 +13,17 @@ logger = logging.getLogger(__name__)
 class TextRegion:
     """Standard representation of an OCR text region."""
-    def __init__(self, bbox: Tuple[float, float, float, float], text: str, confidence: float, source: str = "ocr"):
+    def __init__(
+        self,
+        bbox: Tuple[float, float, float, float],
+        text: str,
+        confidence: float,
+        source: str = "ocr",
+    ):
         """
         Initialize a text region.
         Args:
             bbox: Tuple of (x0, y0, x1, y1) coordinates
             text: The recognized text
@@ -28,7 +34,7 @@ class TextRegion:
         self.text = text
         self.confidence = confidence
         self.source = source
     @classmethod
     def from_polygon(cls, polygon: List[List[float]], text: str, confidence: float):
         """Create from polygon coordinates [[x1,y1], [x2,y2], ...]"""
@@ -36,24 +42,24 @@ class TextRegion:
         y_coords = [float(point[1]) for point in polygon]
         bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
         return cls(bbox, text, confidence)
     def to_dict(self) -> Dict[str, Any]:
         """Convert to dictionary representation for compatibility."""
         return {
             "bbox": self.bbox,
             "text": self.text,
             "confidence": self.confidence,
-            "source": self.source
+            "source": self.source,
         }
 class OCREngine(ABC):
     """Abstract Base Class for OCR engines."""
     # Default values as class constants
     DEFAULT_MIN_CONFIDENCE = 0.2
-    DEFAULT_LANGUAGES = ['en']
-    DEFAULT_DEVICE = 'cpu'
+    DEFAULT_LANGUAGES = ["en"]
+    DEFAULT_DEVICE = "cpu"
     def __init__(self):
         """Initializes the base OCR engine."""
@@ -74,7 +80,7 @@ class OCREngine(ABC):
     ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
         """
         Process a single image or batch of images with OCR.
         Args:
             images: A single PIL Image or a list of PIL Images
             languages: List of languages to use (default: ['en'])
@@ -82,7 +88,7 @@ class OCREngine(ABC):
             device: Device to use for processing (default: 'cpu')
             detect_only: Whether to only detect text regions without recognition
             options: Engine-specific options
         Returns:
             For a single image: List of text region dictionaries
             For a batch: List of lists of text region dictionaries
@@ -90,42 +96,48 @@ class OCREngine(ABC):
         # Convert single image to batch format
         single_image = not isinstance(images, list)
         image_batch = [images] if single_image else images
         # Use default values where parameters are not provided
         effective_languages = languages or self.DEFAULT_LANGUAGES
-        effective_confidence = min_confidence if min_confidence is not None else self.DEFAULT_MIN_CONFIDENCE
+        effective_confidence = (
+            min_confidence if min_confidence is not None else self.DEFAULT_MIN_CONFIDENCE
+        )
         effective_device = device or self.DEFAULT_DEVICE
         # Ensure the model is initialized
         self._ensure_initialized(effective_languages, effective_device, options)
         # Process each image in the batch
         results = []
         for img in image_batch:
             # Preprocess the image for the specific engine
             processed_img = self._preprocess_image(img)
             # Process the image with the engine-specific implementation
             raw_results = self._process_single_image(processed_img, detect_only, options)
             # Convert results to standardized format
             text_regions = self._standardize_results(raw_results, effective_confidence, detect_only)
             # Convert TextRegion objects to dictionaries for backward compatibility
             region_dicts = [region.to_dict() for region in text_regions]
             results.append(region_dicts)
         # Return results in the appropriate format
         return results[0] if single_image else results
-    def _ensure_initialized(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
+    def _ensure_initialized(
+        self, languages: List[str], device: str, options: Optional[BaseOCROptions]
+    ):
         """Ensure the model is initialized with the correct parameters."""
         if not self._initialized:
             self._initialize_model(languages, device, options)
             self._initialized = True
     @abstractmethod
-    def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
+    def _initialize_model(
+        self, languages: List[str], device: str, options: Optional[BaseOCROptions]
+    ):
         """Initialize the OCR model with the given parameters."""
         raise NotImplementedError("Subclasses must implement this method")
@@ -133,14 +145,18 @@ class OCREngine(ABC):
     def _preprocess_image(self, image: Image.Image) -> Any:
         """Convert PIL Image to engine-specific format."""
         raise NotImplementedError("Subclasses must implement this method")
     @abstractmethod
-    def _process_single_image(self, image: Any, detect_only: bool, options: Optional[BaseOCROptions]) -> Any:
+    def _process_single_image(
+        self, image: Any, detect_only: bool, options: Optional[BaseOCROptions]
+    ) -> Any:
         """Process a single image with the initialized model."""
         raise NotImplementedError("Subclasses must implement this method")
     @abstractmethod
-    def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
+    def _standardize_results(
+        self, raw_results: Any, min_confidence: float, detect_only: bool
+    ) -> List[TextRegion]:
         """Convert engine-specific results to standardized TextRegion objects."""
         raise NotImplementedError("Subclasses must implement this method")
@@ -181,23 +197,23 @@ class OCREngine(ABC):
                 return tuple(float(c) for c in bbox[:4])
             except (ValueError, TypeError) as e:
                 raise ValueError(f"Invalid number format in bbox: {bbox}") from e
         # Check if it's in polygon format [[x1,y1],[x2,y2],...]
         elif (
             isinstance(bbox, (list, tuple))
             and len(bbox) > 0
             and isinstance(bbox[0], (list, tuple))
-            and len(bbox[0]) == 2 # Ensure points are pairs
+            and len(bbox[0]) == 2  # Ensure points are pairs
         ):
             try:
                 x_coords = [float(point[0]) for point in bbox]
                 y_coords = [float(point[1]) for point in bbox]
-                if not x_coords or not y_coords: # Handle empty polygon case
+                if not x_coords or not y_coords:  # Handle empty polygon case
                     raise ValueError("Empty polygon provided")
                 return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
             except (ValueError, TypeError, IndexError) as e:
                 raise ValueError(f"Invalid polygon format or values: {bbox}") from e
         # If it's neither format, raise an error
         raise ValueError(f"Could not standardize bounding box from unexpected format: {bbox}")

natural_pdf/ocr/engine_easyocr.py CHANGED Viewed

@@ -18,28 +18,31 @@ class EasyOCREngine(OCREngine):
     def __init__(self):
         super().__init__()
         # No longer need _easyocr attribute
-        # self._easyocr = None
+        # self._easyocr = None
     def is_available(self) -> bool:
         """Check if EasyOCR is installed."""
         return importlib.util.find_spec("easyocr") is not None
-    def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
+    def _initialize_model(
+        self, languages: List[str], device: str, options: Optional[BaseOCROptions]
+    ):
         """Initialize the EasyOCR model."""
         # Import directly here
         try:
             import easyocr
             self.logger.info("EasyOCR module imported successfully.")
         except ImportError as e:
             self.logger.error(f"Failed to import EasyOCR: {e}")
             raise
         # Cast to EasyOCROptions if possible, otherwise use default
         easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
         # Prepare constructor arguments
         use_gpu = "cuda" in device.lower() or "mps" in device.lower()
         constructor_args = {
             "lang_list": languages,
             "gpu": use_gpu,
@@ -55,12 +58,12 @@ class EasyOCREngine(OCREngine):
             "quantize": easy_options.quantize,
             "cudnn_benchmark": easy_options.cudnn_benchmark,
         }
         # Filter out None values, as EasyOCR expects non-None or default behaviour
         constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
         self.logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
         # Create the reader
         try:
             self._model = easyocr.Reader(**constructor_args)
@@ -73,46 +76,72 @@ class EasyOCREngine(OCREngine):
         """Convert PIL Image to numpy array for EasyOCR."""
         return np.array(image)
-    def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]) -> Any:
+    def _process_single_image(
+        self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]
+    ) -> Any:
         """Process a single image with EasyOCR."""
         if self._model is None:
             raise RuntimeError("EasyOCR model not initialized")
         # Cast options to proper type if provided
         easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
         # Prepare readtext arguments (only needed if not detect_only)
         readtext_args = {}
         if not detect_only:
             for param in [
-                "detail", "paragraph", "min_size", "contrast_ths", "adjust_contrast",
-                "filter_ths", "text_threshold", "low_text", "link_threshold",
-                "canvas_size", "mag_ratio", "slope_ths", "ycenter_ths", "height_ths",
-                "width_ths", "y_ths", "x_ths", "add_margin", "output_format"
+                "detail",
+                "paragraph",
+                "min_size",
+                "contrast_ths",
+                "adjust_contrast",
+                "filter_ths",
+                "text_threshold",
+                "low_text",
+                "link_threshold",
+                "canvas_size",
+                "mag_ratio",
+                "slope_ths",
+                "ycenter_ths",
+                "height_ths",
+                "width_ths",
+                "y_ths",
+                "x_ths",
+                "add_margin",
+                "output_format",
             ]:
                 if hasattr(easy_options, param):
                     val = getattr(easy_options, param)
                     if val is not None:
                         readtext_args[param] = val
         # Process differently based on detect_only flag
         if detect_only:
             # Returns tuple (horizontal_list, free_list)
             # horizontal_list is a list containing one item: the list of boxes
             # Each box is [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
-            bboxes_tuple = self._model.detect(image, **readtext_args) # Pass args here too? Check EasyOCR docs if needed.
-            if bboxes_tuple and isinstance(bboxes_tuple, tuple) and len(bboxes_tuple) > 0 and isinstance(bboxes_tuple[0], list):
-                return bboxes_tuple[0] # Return the list of polygons directly
+            bboxes_tuple = self._model.detect(
+                image, **readtext_args
+            )  # Pass args here too? Check EasyOCR docs if needed.
+            if (
+                bboxes_tuple
+                and isinstance(bboxes_tuple, tuple)
+                and len(bboxes_tuple) > 0
+                and isinstance(bboxes_tuple[0], list)
+            ):
+                return bboxes_tuple[0]  # Return the list of polygons directly
             else:
                 self.logger.warning(f"EasyOCR detect returned unexpected format: {bboxes_tuple}")
-                return [] # Return empty list on unexpected format
+                return []  # Return empty list on unexpected format
         else:
             return self._model.readtext(image, **readtext_args)
-    def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
+    def _standardize_results(
+        self, raw_results: Any, min_confidence: float, detect_only: bool
+    ) -> List[TextRegion]:
         """Convert EasyOCR results to standardized TextRegion objects."""
         standardized_regions = []
         if detect_only:
             # In detect_only mode, raw_results is already a list of bounding boxes
             # Each bbox is in [x_min, x_max, y_min, y_max] format
@@ -120,13 +149,17 @@ class EasyOCREngine(OCREngine):
                 for detection in raw_results:
                     try:
                         if isinstance(detection, (list, tuple)) and len(detection) == 4:
-                             x_min, x_max, y_min, y_max = detection
-                             # Convert to standardized (x0, y0, x1, y1) format
-                             try:
-                                 bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
-                                 standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
-                             except (ValueError, TypeError) as e:
-                                 raise ValueError(f"Invalid number format in EasyOCR detect bbox: {detection}") from e
+                            x_min, x_max, y_min, y_max = detection
+                            # Convert to standardized (x0, y0, x1, y1) format
+                            try:
+                                bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
+                                standardized_regions.append(
+                                    TextRegion(bbox, text=None, confidence=None)
+                                )
+                            except (ValueError, TypeError) as e:
+                                raise ValueError(
+                                    f"Invalid number format in EasyOCR detect bbox: {detection}"
+                                ) from e
                         else:
                             raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
                     except ValueError as e:
@@ -134,42 +167,50 @@ class EasyOCREngine(OCREngine):
                         raise e
                     except Exception as e:
                         # Catch other potential processing errors
-                        raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
+                        raise ValueError(
+                            f"Error processing EasyOCR detection item: {detection}"
+                        ) from e
             else:
-                raise ValueError(f"Expected list of bounding boxes in detect_only mode, got: {raw_results}")
+                raise ValueError(
+                    f"Expected list of bounding boxes in detect_only mode, got: {raw_results}"
+                )
             return standardized_regions
         # Full OCR mode (readtext results)
         for detection in raw_results:
             try:
                 # Detail mode (list/tuple result)
                 if isinstance(detection, (list, tuple)) and len(detection) >= 3:
-                    bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
+                    bbox_raw = detection[0]  # This is usually a polygon [[x1,y1],...]
                     text = str(detection[1])
                     confidence = float(detection[2])
                     if confidence >= min_confidence:
                         try:
                             # Use the standard helper for polygons
                             bbox = self._standardize_bbox(bbox_raw)
                             standardized_regions.append(TextRegion(bbox, text, confidence))
                         except ValueError as e:
-                            raise ValueError(f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}") from e
+                            raise ValueError(
+                                f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}"
+                            ) from e
                 # Simple mode (string result)
                 elif isinstance(detection, str):
                     if 0.0 >= min_confidence:  # Always include if min_confidence is 0
                         standardized_regions.append(TextRegion((0, 0, 0, 0), detection, 1.0))
                 else:
                     # Handle unexpected format in OCR mode
-                    raise ValueError(f"Invalid OCR detection format from EasyOCR readtext: {detection}")
+                    raise ValueError(
+                        f"Invalid OCR detection format from EasyOCR readtext: {detection}"
+                    )
             except ValueError as e:
                 # Re-raise any value errors from standardization or format checks
                 raise e
             except Exception as e:
                 # Catch other potential processing errors
                 raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
         return standardized_regions

natural_pdf/ocr/engine_paddle.py CHANGED Viewed

@@ -27,40 +27,43 @@ class PaddleOCREngine(OCREngine):
         paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
         return paddle_installed and paddleocr_installed
-    def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
+    def _initialize_model(
+        self, languages: List[str], device: str, options: Optional[BaseOCROptions]
+    ):
         """Initialize the PaddleOCR model."""
         try:
-            import paddleocr
+            import paddleocr
             self.logger.info("PaddleOCR module imported successfully.")
         except ImportError as e:
-             self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
-             raise
+            self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
+            raise
         # Cast to PaddleOCROptions if possible
         paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
         # Determine parameters
         primary_lang = languages[0] if languages else "en"
         use_gpu = "cuda" in str(device).lower()
         # Create constructor arguments
         constructor_args = {
             "lang": primary_lang,
             "use_gpu": use_gpu,
             "use_angle_cls": paddle_options.use_angle_cls,
-            "det": True,
-            "rec": True  # We'll control recognition at process time
+            "det": True,
+            "rec": True,  # We'll control recognition at process time
         }
         # Add optional parameters if available
         for param in ["det_model_dir", "rec_model_dir", "cls_model_dir", "show_log", "use_onnx"]:
             if hasattr(paddle_options, param):
                 val = getattr(paddle_options, param)
                 if val is not None:
                     constructor_args[param] = val
         self.logger.debug(f"PaddleOCR constructor args: {constructor_args}")
         # Create the model
         try:
             self._model = paddleocr.PaddleOCR(**constructor_args)
@@ -78,31 +81,35 @@ class PaddleOCREngine(OCREngine):
         img_array_bgr = img_array_rgb[:, :, ::-1]  # Convert RGB to BGR
         return img_array_bgr
-    def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]) -> Any:
+    def _process_single_image(
+        self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]
+    ) -> Any:
         """Process a single image with PaddleOCR."""
         if self._model is None:
             raise RuntimeError("PaddleOCR model not initialized")
         # Prepare OCR arguments
         ocr_args = {}
         if options and isinstance(options, PaddleOCROptions):
             ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
             ocr_args["det"] = options.det
             ocr_args["rec"] = not detect_only  # Control recognition based on detect_only flag
         # Run OCR
         raw_results = self._model.ocr(image, **ocr_args)
         return raw_results
-    def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
+    def _standardize_results(
+        self, raw_results: Any, min_confidence: float, detect_only: bool
+    ) -> List[TextRegion]:
         """Convert PaddleOCR results to standardized TextRegion objects."""
         standardized_regions = []
         if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
             return standardized_regions
         page_results = raw_results[0] if raw_results[0] is not None else []
         for detection in page_results:
             # Initialize text and confidence
             text = None
@@ -111,20 +118,22 @@ class PaddleOCREngine(OCREngine):
             # Paddle always seems to return the tuple structure [bbox, (text, conf)]
             # even if rec=False. We need to parse this structure regardless.
-            if len(detection) == 4: # Handle potential alternative format?
-                 detection = [detection, ('', 1.0)] # Treat as bbox + dummy text/conf
+            if len(detection) == 4:  # Handle potential alternative format?
+                detection = [detection, ("", 1.0)]  # Treat as bbox + dummy text/conf
             if not isinstance(detection, (list, tuple)) or len(detection) < 2:
                 raise ValueError(f"Invalid detection format from PaddleOCR: {detection}")
             bbox_raw = detection[0]
             text_confidence = detection[1]
             if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
-                # Even if detect_only, we expect the (text, conf) structure,
+                # Even if detect_only, we expect the (text, conf) structure,
                 # it might just contain dummy values.
-                raise ValueError(f"Invalid text/confidence structure from PaddleOCR: {text_confidence}")
+                raise ValueError(
+                    f"Invalid text/confidence structure from PaddleOCR: {text_confidence}"
+                )
             # Extract text/conf only if not detect_only
             if not detect_only:
                 text = str(text_confidence[0])
@@ -134,7 +143,9 @@ class PaddleOCREngine(OCREngine):
             try:
                 bbox = self._standardize_bbox(bbox_raw)
             except ValueError as e:
-                raise ValueError(f"Could not standardize bounding box from PaddleOCR: {bbox_raw}") from e
+                raise ValueError(
+                    f"Could not standardize bounding box from PaddleOCR: {bbox_raw}"
+                ) from e
             # Append based on mode
             if detect_only:
@@ -143,5 +154,5 @@ class PaddleOCREngine(OCREngine):
             elif confidence >= min_confidence:
                 # Only append if confidence meets threshold in full OCR mode
                 standardized_regions.append(TextRegion(bbox, text, confidence))
         return standardized_regions

natural_pdf/ocr/engine_surya.py CHANGED Viewed

@@ -20,14 +20,16 @@ class SuryaOCREngine(OCREngine):
         self._surya_recognition = None
         self._surya_detection = None
-    def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
+    def _initialize_model(
+        self, languages: List[str], device: str, options: Optional[BaseOCROptions]
+    ):
         """Initialize Surya predictors."""
         if not self.is_available():
             raise ImportError("Surya OCR library is not installed or available.")
         # Store languages for use in _process_single_image
         self._langs = languages
         from surya.detection import DetectionPredictor
         from surya.recognition import RecognitionPredictor
@@ -41,21 +43,27 @@ class SuryaOCREngine(OCREngine):
         self._detection_predictor = self._surya_detection(**predictor_args)
         self.logger.info("Instantiating Surya RecognitionPredictor...")
         self._recognition_predictor = self._surya_recognition(**predictor_args)
         self.logger.info("Surya predictors initialized.")
     def _preprocess_image(self, image: Image.Image) -> Image.Image:
         """Surya uses PIL images directly, so just return the image."""
         return image
-    def _process_single_image(self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]) -> Any:
+    def _process_single_image(
+        self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]
+    ) -> Any:
         """Process a single image with Surya OCR."""
         if not self._recognition_predictor or not self._detection_predictor:
             raise RuntimeError("Surya predictors are not initialized.")
         # Store languages instance variable during initialization to use here
-        langs = [[lang] for lang in self._langs] if hasattr(self, '_langs') else [[self.DEFAULT_LANGUAGES[0]]]
+        langs = (
+            [[lang] for lang in self._langs]
+            if hasattr(self, "_langs")
+            else [[self.DEFAULT_LANGUAGES[0]]]
+        )
         # Surya expects lists of images, so we need to wrap our single image
         if detect_only:
             results = self._detection_predictor(images=[image])
@@ -63,33 +71,41 @@ class SuryaOCREngine(OCREngine):
             results = self._recognition_predictor(
                 images=[image],
                 langs=langs,  # Use the languages set during initialization
-                det_predictor=self._detection_predictor
+                det_predictor=self._detection_predictor,
             )
         # Surya may return a list with one result per image or a single result object
         # Return the result as-is and handle the extraction in _standardize_results
         return results
-    def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
+    def _standardize_results(
+        self, raw_results: Any, min_confidence: float, detect_only: bool
+    ) -> List[TextRegion]:
         """Convert Surya results to standardized TextRegion objects."""
         standardized_regions = []
         raw_result = raw_results
         if isinstance(raw_results, list) and len(raw_results) > 0:
             raw_result = raw_results[0]
-        results = raw_result.text_lines if hasattr(raw_result, "text_lines") and not detect_only else raw_result.bboxes
+        results = (
+            raw_result.text_lines
+            if hasattr(raw_result, "text_lines") and not detect_only
+            else raw_result.bboxes
+        )
         for line in results:
             # Always extract bbox first
             try:
                 # Prioritize line.bbox, fallback to line.polygon
-                bbox_raw = line.bbox if hasattr(line, 'bbox') else getattr(line, 'polygon', None)
+                bbox_raw = line.bbox if hasattr(line, "bbox") else getattr(line, "polygon", None)
                 if bbox_raw is None:
-                     raise ValueError("Missing bbox/polygon data")
+                    raise ValueError("Missing bbox/polygon data")
                 bbox = self._standardize_bbox(bbox_raw)
             except ValueError as e:
-                raise ValueError(f"Could not standardize bounding box from Surya result: {bbox_raw}") from e
+                raise ValueError(
+                    f"Could not standardize bounding box from Surya result: {bbox_raw}"
+                ) from e
             if detect_only:
                 # For detect_only, text and confidence are None
@@ -100,7 +116,7 @@ class SuryaOCREngine(OCREngine):
                 confidence = line.confidence
                 if confidence >= min_confidence:
                     standardized_regions.append(TextRegion(bbox, text, confidence))
         return standardized_regions
     def is_available(self) -> bool:

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl