PyPI - natural-pdf - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl - Mend

natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

docs/categorizing-documents/index.md +168 -0
docs/data-extraction/index.md +87 -0
docs/element-selection/index.ipynb +218 -164
docs/element-selection/index.md +20 -0
docs/finetuning/index.md +176 -0
docs/index.md +19 -0
docs/ocr/index.md +63 -16
docs/tutorials/01-loading-and-extraction.ipynb +411 -248
docs/tutorials/02-finding-elements.ipynb +123 -46
docs/tutorials/03-extracting-blocks.ipynb +24 -19
docs/tutorials/04-table-extraction.ipynb +17 -12
docs/tutorials/05-excluding-content.ipynb +37 -32
docs/tutorials/06-document-qa.ipynb +36 -31
docs/tutorials/07-layout-analysis.ipynb +45 -40
docs/tutorials/07-working-with-regions.ipynb +61 -60
docs/tutorials/08-spatial-navigation.ipynb +76 -71
docs/tutorials/09-section-extraction.ipynb +160 -155
docs/tutorials/10-form-field-extraction.ipynb +71 -66
docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
docs/tutorials/12-ocr-integration.ipynb +3420 -312
docs/tutorials/12-ocr-integration.md +68 -106
docs/tutorials/13-semantic-search.ipynb +641 -251
natural_pdf/__init__.py +3 -0
natural_pdf/analyzers/layout/gemini.py +63 -47
natural_pdf/classification/manager.py +343 -0
natural_pdf/classification/mixin.py +149 -0
natural_pdf/classification/results.py +62 -0
natural_pdf/collections/mixins.py +63 -0
natural_pdf/collections/pdf_collection.py +326 -17
natural_pdf/core/element_manager.py +73 -4
natural_pdf/core/page.py +255 -83
natural_pdf/core/pdf.py +385 -367
natural_pdf/elements/base.py +1 -3
natural_pdf/elements/collections.py +279 -49
natural_pdf/elements/region.py +106 -21
natural_pdf/elements/text.py +5 -2
natural_pdf/exporters/__init__.py +4 -0
natural_pdf/exporters/base.py +61 -0
natural_pdf/exporters/paddleocr.py +345 -0
natural_pdf/extraction/manager.py +134 -0
natural_pdf/extraction/mixin.py +246 -0
natural_pdf/extraction/result.py +37 -0
natural_pdf/ocr/__init__.py +16 -8
natural_pdf/ocr/engine.py +46 -30
natural_pdf/ocr/engine_easyocr.py +86 -42
natural_pdf/ocr/engine_paddle.py +39 -28
natural_pdf/ocr/engine_surya.py +32 -16
natural_pdf/ocr/ocr_factory.py +34 -23
natural_pdf/ocr/ocr_manager.py +98 -34
natural_pdf/ocr/ocr_options.py +38 -10
natural_pdf/ocr/utils.py +59 -33
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/selectors/parser.py +363 -238
natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
natural_pdf/utils/debug.py +4 -2
natural_pdf/utils/identifiers.py +9 -5
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/packaging.py +172 -105
natural_pdf/utils/text_extraction.py +96 -65
natural_pdf/utils/tqdm_utils.py +43 -0
natural_pdf/utils/visualization.py +1 -1
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0

natural_pdf/ocr/engine_easyocr.py CHANGED Viewed

@@ -18,28 +18,31 @@ class EasyOCREngine(OCREngine):
     def __init__(self):
         super().__init__()
         # No longer need _easyocr attribute
-        # self._easyocr = None
+        # self._easyocr = None
     def is_available(self) -> bool:
         """Check if EasyOCR is installed."""
         return importlib.util.find_spec("easyocr") is not None
-    def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
+    def _initialize_model(
+        self, languages: List[str], device: str, options: Optional[BaseOCROptions]
+    ):
         """Initialize the EasyOCR model."""
         # Import directly here
         try:
             import easyocr
             self.logger.info("EasyOCR module imported successfully.")
         except ImportError as e:
             self.logger.error(f"Failed to import EasyOCR: {e}")
             raise
         # Cast to EasyOCROptions if possible, otherwise use default
         easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
         # Prepare constructor arguments
         use_gpu = "cuda" in device.lower() or "mps" in device.lower()
         constructor_args = {
             "lang_list": languages,
             "gpu": use_gpu,
@@ -55,12 +58,12 @@ class EasyOCREngine(OCREngine):
             "quantize": easy_options.quantize,
             "cudnn_benchmark": easy_options.cudnn_benchmark,
         }
         # Filter out None values, as EasyOCR expects non-None or default behaviour
         constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
         self.logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
         # Create the reader
         try:
             self._model = easyocr.Reader(**constructor_args)
@@ -73,103 +76,144 @@ class EasyOCREngine(OCREngine):
         """Convert PIL Image to numpy array for EasyOCR."""
         return np.array(image)
-    def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]) -> Any:
+    def _process_single_image(
+        self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]
+    ) -> Any:
         """Process a single image with EasyOCR."""
         if self._model is None:
             raise RuntimeError("EasyOCR model not initialized")
         # Cast options to proper type if provided
         easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
         # Prepare readtext arguments (only needed if not detect_only)
         readtext_args = {}
         if not detect_only:
             for param in [
-                "detail", "paragraph", "min_size", "contrast_ths", "adjust_contrast",
-                "filter_ths", "text_threshold", "low_text", "link_threshold",
-                "canvas_size", "mag_ratio", "slope_ths", "ycenter_ths", "height_ths",
-                "width_ths", "y_ths", "x_ths", "add_margin", "output_format"
+                "detail",
+                "paragraph",
+                "min_size",
+                "contrast_ths",
+                "adjust_contrast",
+                "filter_ths",
+                "text_threshold",
+                "low_text",
+                "link_threshold",
+                "canvas_size",
+                "mag_ratio",
+                "slope_ths",
+                "ycenter_ths",
+                "height_ths",
+                "width_ths",
+                "y_ths",
+                "x_ths",
+                "add_margin",
+                "output_format",
             ]:
                 if hasattr(easy_options, param):
                     val = getattr(easy_options, param)
                     if val is not None:
                         readtext_args[param] = val
         # Process differently based on detect_only flag
         if detect_only:
             # Returns tuple (horizontal_list, free_list)
             # horizontal_list is a list containing one item: the list of boxes
             # Each box is [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
-            bboxes_tuple = self._model.detect(image, **readtext_args) # Pass args here too? Check EasyOCR docs if needed.
-            if bboxes_tuple and isinstance(bboxes_tuple, tuple) and len(bboxes_tuple) > 0 and isinstance(bboxes_tuple[0], list):
-                return bboxes_tuple[0] # Return the list of polygons directly
+            bboxes_tuple = self._model.detect(
+                image, **readtext_args
+            )  # Pass args here too? Check EasyOCR docs if needed.
+            if (
+                bboxes_tuple
+                and isinstance(bboxes_tuple, tuple)
+                and len(bboxes_tuple) > 0
+                and isinstance(bboxes_tuple[0], list)
+            ):
+                return bboxes_tuple[0]  # Return the list of polygons directly
             else:
                 self.logger.warning(f"EasyOCR detect returned unexpected format: {bboxes_tuple}")
-                return [] # Return empty list on unexpected format
+                return []  # Return empty list on unexpected format
         else:
             return self._model.readtext(image, **readtext_args)
-    def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
+    def _standardize_results(
+        self, raw_results: Any, min_confidence: float, detect_only: bool
+    ) -> List[TextRegion]:
         """Convert EasyOCR results to standardized TextRegion objects."""
         standardized_regions = []
         if detect_only:
+            results = raw_results[0]
             # In detect_only mode, raw_results is already a list of bounding boxes
             # Each bbox is in [x_min, x_max, y_min, y_max] format
-            if isinstance(raw_results, list):
-                for detection in raw_results:
+            if isinstance(results, list):
+                for detection in results:
                     try:
+                        # This block expects 'detection' to be a list/tuple of 4 numbers
                         if isinstance(detection, (list, tuple)) and len(detection) == 4:
-                             x_min, x_max, y_min, y_max = detection
-                             # Convert to standardized (x0, y0, x1, y1) format
-                             try:
-                                 bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
-                                 standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
-                             except (ValueError, TypeError) as e:
-                                 raise ValueError(f"Invalid number format in EasyOCR detect bbox: {detection}") from e
+                            x_min, x_max, y_min, y_max = detection
+                            # Convert to standardized (x0, y0, x1, y1) format
+                            try:
+                                bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
+                                standardized_regions.append(
+                                    TextRegion(bbox, text=None, confidence=None)
+                                )
+                            except (ValueError, TypeError) as e:
+                                raise ValueError(
+                                    f"Invalid number format in EasyOCR detect bbox: {detection}"
+                                ) from e
                         else:
+                            # This is where the error is raised if 'detection' is not a list/tuple of 4 numbers
                             raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
                     except ValueError as e:
                         # Re-raise any value errors from standardization or format checks
                         raise e
                     except Exception as e:
                         # Catch other potential processing errors
-                        raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
+                        raise ValueError(
+                            f"Error processing EasyOCR detection item: {detection}"
+                        ) from e
             else:
-                raise ValueError(f"Expected list of bounding boxes in detect_only mode, got: {raw_results}")
+                raise ValueError(
+                    f"Expected list of bounding boxes in detect_only mode, got: {type(raw_results)}"
+                )
             return standardized_regions
         # Full OCR mode (readtext results)
         for detection in raw_results:
             try:
                 # Detail mode (list/tuple result)
                 if isinstance(detection, (list, tuple)) and len(detection) >= 3:
-                    bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
+                    bbox_raw = detection[0]  # This is usually a polygon [[x1,y1],...]
                     text = str(detection[1])
                     confidence = float(detection[2])
                     if confidence >= min_confidence:
                         try:
                             # Use the standard helper for polygons
                             bbox = self._standardize_bbox(bbox_raw)
                             standardized_regions.append(TextRegion(bbox, text, confidence))
                         except ValueError as e:
-                            raise ValueError(f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}") from e
+                            raise ValueError(
+                                f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}"
+                            ) from e
                 # Simple mode (string result)
                 elif isinstance(detection, str):
                     if 0.0 >= min_confidence:  # Always include if min_confidence is 0
                         standardized_regions.append(TextRegion((0, 0, 0, 0), detection, 1.0))
                 else:
                     # Handle unexpected format in OCR mode
-                    raise ValueError(f"Invalid OCR detection format from EasyOCR readtext: {detection}")
+                    raise ValueError(
+                        f"Invalid OCR detection format from EasyOCR readtext: {detection}"
+                    )
             except ValueError as e:
                 # Re-raise any value errors from standardization or format checks
                 raise e
             except Exception as e:
                 # Catch other potential processing errors
                 raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
         return standardized_regions

natural_pdf/ocr/engine_paddle.py CHANGED Viewed

@@ -27,40 +27,43 @@ class PaddleOCREngine(OCREngine):
         paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
         return paddle_installed and paddleocr_installed
-    def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
+    def _initialize_model(
+        self, languages: List[str], device: str, options: Optional[BaseOCROptions]
+    ):
         """Initialize the PaddleOCR model."""
         try:
-            import paddleocr
+            import paddleocr
             self.logger.info("PaddleOCR module imported successfully.")
         except ImportError as e:
-             self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
-             raise
+            self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
+            raise
         # Cast to PaddleOCROptions if possible
         paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
         # Determine parameters
         primary_lang = languages[0] if languages else "en"
         use_gpu = "cuda" in str(device).lower()
         # Create constructor arguments
         constructor_args = {
             "lang": primary_lang,
             "use_gpu": use_gpu,
             "use_angle_cls": paddle_options.use_angle_cls,
-            "det": True,
-            "rec": True  # We'll control recognition at process time
+            "det": True,
+            "rec": True,  # We'll control recognition at process time
         }
         # Add optional parameters if available
         for param in ["det_model_dir", "rec_model_dir", "cls_model_dir", "show_log", "use_onnx"]:
             if hasattr(paddle_options, param):
                 val = getattr(paddle_options, param)
                 if val is not None:
                     constructor_args[param] = val
         self.logger.debug(f"PaddleOCR constructor args: {constructor_args}")
         # Create the model
         try:
             self._model = paddleocr.PaddleOCR(**constructor_args)
@@ -78,31 +81,35 @@ class PaddleOCREngine(OCREngine):
         img_array_bgr = img_array_rgb[:, :, ::-1]  # Convert RGB to BGR
         return img_array_bgr
-    def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]) -> Any:
+    def _process_single_image(
+        self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]
+    ) -> Any:
         """Process a single image with PaddleOCR."""
         if self._model is None:
             raise RuntimeError("PaddleOCR model not initialized")
         # Prepare OCR arguments
         ocr_args = {}
         if options and isinstance(options, PaddleOCROptions):
             ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
             ocr_args["det"] = options.det
             ocr_args["rec"] = not detect_only  # Control recognition based on detect_only flag
         # Run OCR
         raw_results = self._model.ocr(image, **ocr_args)
         return raw_results
-    def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
+    def _standardize_results(
+        self, raw_results: Any, min_confidence: float, detect_only: bool
+    ) -> List[TextRegion]:
         """Convert PaddleOCR results to standardized TextRegion objects."""
         standardized_regions = []
         if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
             return standardized_regions
         page_results = raw_results[0] if raw_results[0] is not None else []
         for detection in page_results:
             # Initialize text and confidence
             text = None
@@ -111,20 +118,22 @@ class PaddleOCREngine(OCREngine):
             # Paddle always seems to return the tuple structure [bbox, (text, conf)]
             # even if rec=False. We need to parse this structure regardless.
-            if len(detection) == 4: # Handle potential alternative format?
-                 detection = [detection, ('', 1.0)] # Treat as bbox + dummy text/conf
+            if len(detection) == 4:  # Handle potential alternative format?
+                detection = [detection, ("", 1.0)]  # Treat as bbox + dummy text/conf
             if not isinstance(detection, (list, tuple)) or len(detection) < 2:
                 raise ValueError(f"Invalid detection format from PaddleOCR: {detection}")
             bbox_raw = detection[0]
             text_confidence = detection[1]
             if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
-                # Even if detect_only, we expect the (text, conf) structure,
+                # Even if detect_only, we expect the (text, conf) structure,
                 # it might just contain dummy values.
-                raise ValueError(f"Invalid text/confidence structure from PaddleOCR: {text_confidence}")
+                raise ValueError(
+                    f"Invalid text/confidence structure from PaddleOCR: {text_confidence}"
+                )
             # Extract text/conf only if not detect_only
             if not detect_only:
                 text = str(text_confidence[0])
@@ -134,7 +143,9 @@ class PaddleOCREngine(OCREngine):
             try:
                 bbox = self._standardize_bbox(bbox_raw)
             except ValueError as e:
-                raise ValueError(f"Could not standardize bounding box from PaddleOCR: {bbox_raw}") from e
+                raise ValueError(
+                    f"Could not standardize bounding box from PaddleOCR: {bbox_raw}"
+                ) from e
             # Append based on mode
             if detect_only:
@@ -143,5 +154,5 @@ class PaddleOCREngine(OCREngine):
             elif confidence >= min_confidence:
                 # Only append if confidence meets threshold in full OCR mode
                 standardized_regions.append(TextRegion(bbox, text, confidence))
         return standardized_regions

natural_pdf/ocr/engine_surya.py CHANGED Viewed

@@ -20,14 +20,16 @@ class SuryaOCREngine(OCREngine):
         self._surya_recognition = None
         self._surya_detection = None
-    def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
+    def _initialize_model(
+        self, languages: List[str], device: str, options: Optional[BaseOCROptions]
+    ):
         """Initialize Surya predictors."""
         if not self.is_available():
             raise ImportError("Surya OCR library is not installed or available.")
         # Store languages for use in _process_single_image
         self._langs = languages
         from surya.detection import DetectionPredictor
         from surya.recognition import RecognitionPredictor
@@ -41,21 +43,27 @@ class SuryaOCREngine(OCREngine):
         self._detection_predictor = self._surya_detection(**predictor_args)
         self.logger.info("Instantiating Surya RecognitionPredictor...")
         self._recognition_predictor = self._surya_recognition(**predictor_args)
         self.logger.info("Surya predictors initialized.")
     def _preprocess_image(self, image: Image.Image) -> Image.Image:
         """Surya uses PIL images directly, so just return the image."""
         return image
-    def _process_single_image(self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]) -> Any:
+    def _process_single_image(
+        self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]
+    ) -> Any:
         """Process a single image with Surya OCR."""
         if not self._recognition_predictor or not self._detection_predictor:
             raise RuntimeError("Surya predictors are not initialized.")
         # Store languages instance variable during initialization to use here
-        langs = [[lang] for lang in self._langs] if hasattr(self, '_langs') else [[self.DEFAULT_LANGUAGES[0]]]
+        langs = (
+            [[lang] for lang in self._langs]
+            if hasattr(self, "_langs")
+            else [[self.DEFAULT_LANGUAGES[0]]]
+        )
         # Surya expects lists of images, so we need to wrap our single image
         if detect_only:
             results = self._detection_predictor(images=[image])
@@ -63,33 +71,41 @@ class SuryaOCREngine(OCREngine):
             results = self._recognition_predictor(
                 images=[image],
                 langs=langs,  # Use the languages set during initialization
-                det_predictor=self._detection_predictor
+                det_predictor=self._detection_predictor,
             )
         # Surya may return a list with one result per image or a single result object
         # Return the result as-is and handle the extraction in _standardize_results
         return results
-    def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
+    def _standardize_results(
+        self, raw_results: Any, min_confidence: float, detect_only: bool
+    ) -> List[TextRegion]:
         """Convert Surya results to standardized TextRegion objects."""
         standardized_regions = []
         raw_result = raw_results
         if isinstance(raw_results, list) and len(raw_results) > 0:
             raw_result = raw_results[0]
-        results = raw_result.text_lines if hasattr(raw_result, "text_lines") and not detect_only else raw_result.bboxes
+        results = (
+            raw_result.text_lines
+            if hasattr(raw_result, "text_lines") and not detect_only
+            else raw_result.bboxes
+        )
         for line in results:
             # Always extract bbox first
             try:
                 # Prioritize line.bbox, fallback to line.polygon
-                bbox_raw = line.bbox if hasattr(line, 'bbox') else getattr(line, 'polygon', None)
+                bbox_raw = line.bbox if hasattr(line, "bbox") else getattr(line, "polygon", None)
                 if bbox_raw is None:
-                     raise ValueError("Missing bbox/polygon data")
+                    raise ValueError("Missing bbox/polygon data")
                 bbox = self._standardize_bbox(bbox_raw)
             except ValueError as e:
-                raise ValueError(f"Could not standardize bounding box from Surya result: {bbox_raw}") from e
+                raise ValueError(
+                    f"Could not standardize bounding box from Surya result: {bbox_raw}"
+                ) from e
             if detect_only:
                 # For detect_only, text and confidence are None
@@ -100,7 +116,7 @@ class SuryaOCREngine(OCREngine):
                 confidence = line.confidence
                 if confidence >= min_confidence:
                     standardized_regions.append(TextRegion(bbox, text, confidence))
         return standardized_regions
     def is_available(self) -> bool:

natural_pdf/ocr/ocr_factory.py CHANGED Viewed

@@ -13,14 +13,14 @@ class OCRFactory:
     @staticmethod
     def create_engine(engine_type: str, **kwargs) -> OCREngine:
         """Create and return an OCR engine instance.
         Args:
             engine_type: One of 'surya', 'easyocr', 'paddle'
             **kwargs: Arguments to pass to the engine constructor
         Returns:
             An initialized OCR engine
         Raises:
             ImportError: If the required dependencies aren't installed
             ValueError: If the engine_type is unknown
@@ -28,72 +28,83 @@ class OCRFactory:
         if engine_type == "surya":
             try:
                 from .engine_surya import SuryaOCREngine
                 return SuryaOCREngine(**kwargs)
             except ImportError:
-                raise ImportError("Surya engine requires the 'surya' package. "
-                                 "Install with: pip install surya")
+                raise ImportError(
+                    "Surya engine requires the 'surya' package. " "Install with: pip install surya"
+                )
         elif engine_type == "easyocr":
             try:
                 from .engine_easyocr import EasyOCREngine
                 return EasyOCREngine(**kwargs)
             except ImportError:
-                raise ImportError("EasyOCR engine requires the 'easyocr' package. "
-                                 "Install with: pip install easyocr")
+                raise ImportError(
+                    "EasyOCR engine requires the 'easyocr' package. "
+                    "Install with: pip install easyocr"
+                )
         elif engine_type == "paddle":
             try:
                 from .engine_paddle import PaddleOCREngine
                 return PaddleOCREngine(**kwargs)
             except ImportError:
-                raise ImportError("PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
-                                 "Install with: pip install paddleocr paddlepaddle")
+                raise ImportError(
+                    "PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
+                    "Install with: pip install paddleocr paddlepaddle"
+                )
         else:
             raise ValueError(f"Unknown engine type: {engine_type}")
     @staticmethod
     def list_available_engines() -> Dict[str, bool]:
         """Returns a dictionary of engine names and their availability status."""
         engines = {}
         # Check Surya
         try:
             engines["surya"] = importlib.util.find_spec("surya") is not None
         except ImportError:
             engines["surya"] = False
         # Check EasyOCR
         try:
             engines["easyocr"] = importlib.util.find_spec("easyocr") is not None
         except ImportError:
             engines["easyocr"] = False
         # Check PaddleOCR
         try:
-            paddle = importlib.util.find_spec("paddle") is not None or importlib.util.find_spec("paddlepaddle") is not None
+            paddle = (
+                importlib.util.find_spec("paddle") is not None
+                or importlib.util.find_spec("paddlepaddle") is not None
+            )
             paddleocr = importlib.util.find_spec("paddleocr") is not None
             engines["paddle"] = paddle and paddleocr
         except ImportError:
             engines["paddle"] = False
         return engines
     @staticmethod
     def get_recommended_engine(**kwargs) -> OCREngine:
         """Returns the best available OCR engine based on what's installed.
         First tries engines in order of preference: EasyOCR, Paddle, Surya.
         If none are available, raises ImportError with installation instructions.
         Args:
             **kwargs: Arguments to pass to the engine constructor
         Returns:
             The best available OCR engine instance
         Raises:
             ImportError: If no engines are available
         """
         available = OCRFactory.list_available_engines()
         # Try engines in order of recommendation
         if available.get("easyocr", False):
             logger.info("Using EasyOCR engine (recommended)")
@@ -104,11 +115,11 @@ class OCRFactory:
         elif available.get("surya", False):
             logger.info("Using Surya OCR engine")
             return OCRFactory.create_engine("surya", **kwargs)
         # If we get here, no engines are available
         raise ImportError(
             "No OCR engines available. Please install at least one of: \n"
             "- EasyOCR (recommended): pip install easyocr\n"
             "- PaddleOCR: pip install paddleocr paddlepaddle\n"
             "- Surya OCR: pip install surya"
-        )
+        )

natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl