PyPI - natural-pdf - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +222 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +260 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +409 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +484 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +586 -0
docs/tutorials/12-ocr-integration.md +188 -0
docs/tutorials/13-semantic-search.ipynb +1888 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +39 -20
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +98 -58
natural_pdf/analyzers/layout/layout_options.py +32 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +84 -44
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +126 -98
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +416 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +910 -516
natural_pdf/core/pdf.py +387 -289
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +302 -214
natural_pdf/elements/collections.py +714 -514
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +854 -883
natural_pdf/elements/text.py +122 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +23 -14
natural_pdf/ocr/engine.py +17 -8
natural_pdf/ocr/engine_easyocr.py +63 -47
natural_pdf/ocr/engine_paddle.py +97 -68
natural_pdf/ocr/engine_surya.py +54 -44
natural_pdf/ocr/ocr_manager.py +88 -62
natural_pdf/ocr/ocr_options.py +16 -10
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
natural_pdf-0.1.5.dist-info/RECORD +134 -0
natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
tests/test_loading.py +50 -0
tests/test_optional_deps.py +298 -0
natural_pdf-0.1.3.dist-info/RECORD +0 -61
natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0

natural_pdf/ocr/engine_surya.py CHANGED Viewed

@@ -1,15 +1,17 @@
 # ocr_engine_surya.py
-import logging
 import importlib.util
-from typing import Dict, List, Any, Optional, Tuple, Union
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 from PIL import Image
 from .engine import OCREngine
-from .ocr_options import SuryaOCROptions, BaseOCROptions
+from .ocr_options import BaseOCROptions, SuryaOCROptions
 logger = logging.getLogger(__name__)
 class SuryaOCREngine(OCREngine):
     """Surya OCR engine implementation."""
@@ -30,8 +32,9 @@ class SuryaOCREngine(OCREngine):
             raise ImportError("Surya OCR library is not installed or available.")
         try:
-            from surya.recognition import RecognitionPredictor
             from surya.detection import DetectionPredictor
+            from surya.recognition import RecognitionPredictor
             self._surya_recognition = RecognitionPredictor
             self._surya_detection = DetectionPredictor
             logger.info("Surya modules imported successfully.")
@@ -40,7 +43,7 @@ class SuryaOCREngine(OCREngine):
             # Add arguments from options if Surya supports them
             # Example: device = options.device or 'cuda' if torch.cuda.is_available() else 'cpu'
             # predictor_args = {'device': options.device} # If applicable
-            predictor_args = {} # Assuming parameterless init based on example
+            predictor_args = {}  # Assuming parameterless init based on example
             logger.info("Instantiating Surya DetectionPredictor...")
             self._detection_predictor = self._surya_detection(**predictor_args)
@@ -61,13 +64,17 @@ class SuryaOCREngine(OCREngine):
         """Check if the surya library is installed."""
         return importlib.util.find_spec("surya") is not None
-    def _standardize_results(self, raw_ocr_result: Any, options: SuryaOCROptions) -> List[Dict[str, Any]]:
+    def _standardize_results(
+        self, raw_ocr_result: Any, options: SuryaOCROptions
+    ) -> List[Dict[str, Any]]:
         """Standardizes raw results from a single image from Surya."""
         standardized_page = []
         min_confidence = options.min_confidence
         # Check if the result has the expected structure (OCRResult with text_lines)
-        if not hasattr(raw_ocr_result, 'text_lines') or not isinstance(raw_ocr_result.text_lines, list):
+        if not hasattr(raw_ocr_result, "text_lines") or not isinstance(
+            raw_ocr_result.text_lines, list
+        ):
             logger.warning(f"Unexpected Surya result format: {type(raw_ocr_result)}. Skipping.")
             return standardized_page
@@ -77,52 +84,54 @@ class SuryaOCREngine(OCREngine):
                 text = line.text
                 confidence = line.confidence
                 # Surya provides both polygon and bbox, bbox is already (x0, y0, x1, y1)
-                bbox_raw = line.bbox # Use bbox directly if available and correct format
+                bbox_raw = line.bbox  # Use bbox directly if available and correct format
                 if confidence >= min_confidence:
-                    bbox = self._standardize_bbox(bbox_raw) # Validate/convert format
+                    bbox = self._standardize_bbox(bbox_raw)  # Validate/convert format
                     if bbox:
-                        standardized_page.append({
-                            'bbox': bbox,
-                            'text': text,
-                            'confidence': confidence,
-                            'source': 'ocr'
-                        })
+                        standardized_page.append(
+                            {"bbox": bbox, "text": text, "confidence": confidence, "source": "ocr"}
+                        )
                     else:
                         # Try polygon if bbox failed standardization
                         bbox_poly = self._standardize_bbox(line.polygon)
                         if bbox_poly:
-                             standardized_page.append({
-                                'bbox': bbox_poly, 'text': text, 'confidence': confidence, 'source': 'ocr'
-                             })
+                            standardized_page.append(
+                                {
+                                    "bbox": bbox_poly,
+                                    "text": text,
+                                    "confidence": confidence,
+                                    "source": "ocr",
+                                }
+                            )
                         else:
-                             logger.warning(f"Skipping Surya line due to invalid bbox/polygon: {line}")
+                            logger.warning(
+                                f"Skipping Surya line due to invalid bbox/polygon: {line}"
+                            )
             except (AttributeError, ValueError, TypeError) as e:
-                 logger.warning(f"Skipping invalid Surya TextLine format: {line}. Error: {e}")
-                 continue
+                logger.warning(f"Skipping invalid Surya TextLine format: {line}. Error: {e}")
+                continue
         return standardized_page
     def process_image(
-        self,
-        images: Union[Image.Image, List[Image.Image]],
-        options: BaseOCROptions
+        self, images: Union[Image.Image, List[Image.Image]], options: BaseOCROptions
     ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
         """Processes a single image or a batch of images with Surya OCR."""
         if not isinstance(options, SuryaOCROptions):
-             logger.warning("Received BaseOCROptions, expected SuryaOCROptions. Using defaults.")
-             options = SuryaOCROptions(
-                 languages=options.languages,
-                 min_confidence=options.min_confidence,
-                 device=options.device,
-                 extra_args=options.extra_args
-             )
+            logger.warning("Received BaseOCROptions, expected SuryaOCROptions. Using defaults.")
+            options = SuryaOCROptions(
+                languages=options.languages,
+                min_confidence=options.min_confidence,
+                device=options.device,
+                extra_args=options.extra_args,
+            )
         # Ensure predictors are loaded/initialized
         self._lazy_load_predictors(options)
         if not self._recognition_predictor or not self._detection_predictor:
-             raise RuntimeError("Surya predictors could not be initialized.")
+            raise RuntimeError("Surya predictors could not be initialized.")
         # --- Prepare inputs for Surya ---
         is_batch = isinstance(images, list)
@@ -131,8 +140,8 @@ class SuryaOCREngine(OCREngine):
         input_langs: List[List[str]] = [options.languages for _ in input_images]
         if not input_images:
-             logger.warning("No images provided for Surya processing.")
-             return [] if not is_batch else [[]]
+            logger.warning("No images provided for Surya processing.")
+            return [] if not is_batch else [[]]
         # --- Run Surya Prediction ---
         try:
@@ -141,24 +150,26 @@ class SuryaOCREngine(OCREngine):
             # Call Surya's predictor
             # It returns a list of OCRResult objects, one per input image
             predictions = self._recognition_predictor(
-                images=input_images,
-                langs=input_langs,
-                det_predictor=self._detection_predictor
+                images=input_images, langs=input_langs, det_predictor=self._detection_predictor
             )
             logger.info(f"Surya prediction complete. Received {len(predictions)} results.")
             # --- Standardize Results ---
             if len(predictions) != len(input_images):
-                 logger.error(f"Surya result count ({len(predictions)}) does not match input count ({len(input_images)}). Returning empty results.")
-                 # Decide on error handling: raise error or return empty structure
-                 return [[] for _ in input_images] if is_batch else []
+                logger.error(
+                    f"Surya result count ({len(predictions)}) does not match input count ({len(input_images)}). Returning empty results."
+                )
+                # Decide on error handling: raise error or return empty structure
+                return [[] for _ in input_images] if is_batch else []
-            all_standardized_results = [self._standardize_results(res, options) for res in predictions]
+            all_standardized_results = [
+                self._standardize_results(res, options) for res in predictions
+            ]
             if is_batch:
-                return all_standardized_results # Return List[List[Dict]]
+                return all_standardized_results  # Return List[List[Dict]]
             else:
-                return all_standardized_results[0] # Return List[Dict] for single image
+                return all_standardized_results[0]  # Return List[Dict] for single image
         except Exception as e:
             logger.error(f"Error during Surya OCR processing: {e}", exc_info=True)
@@ -168,4 +179,3 @@ class SuryaOCREngine(OCREngine):
     # Note: Caching is handled differently for Surya as predictors are stateful
     # and initialized once. The base class _reader_cache is not used here.
     # If predictors could be configured per-run, caching would need rethinking.

natural_pdf/ocr/ocr_manager.py CHANGED Viewed

@@ -1,68 +1,76 @@
 # ocr_manager.py
+import copy  # For deep copying options
 import logging
-from typing import Dict, List, Any, Optional, Union, Type
+from typing import Any, Dict, List, Optional, Type, Union
 from PIL import Image
-import copy # For deep copying options
 # Import engine classes and options
 from .engine import OCREngine
 from .engine_easyocr import EasyOCREngine
 from .engine_paddle import PaddleOCREngine
-from .engine_surya import SuryaOCREngine # <-- Import Surya Engine
-from .ocr_options import (
-    BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions, OCROptions # <-- Import Surya Options
-)
+from .engine_surya import SuryaOCREngine  # <-- Import Surya Engine
+from .ocr_options import OCROptions  # <-- Import Surya Options
+from .ocr_options import BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions
 logger = logging.getLogger(__name__)
 class OCRManager:
     """Manages OCR engine selection, configuration, and execution."""
     # Registry mapping engine names to classes and default options
     ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
-        'easyocr': {'class': EasyOCREngine, 'options_class': EasyOCROptions},
-        'paddle': {'class': PaddleOCREngine, 'options_class': PaddleOCROptions},
-        'surya': {'class': SuryaOCREngine, 'options_class': SuryaOCROptions}, # <-- Add Surya
+        "easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
+        "paddle": {"class": PaddleOCREngine, "options_class": PaddleOCROptions},
+        "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},  # <-- Add Surya
         # Add other engines here
     }
     # Define the limited set of kwargs allowed for the simple apply_ocr call
     SIMPLE_MODE_ALLOWED_KWARGS = {
-        'engine', 'languages', 'min_confidence', 'device'
+        "engine",
+        "languages",
+        "min_confidence",
+        "device",
         # Add image pre-processing args like 'resolution', 'width' if handled here
     }
     def __init__(self):
         """Initializes the OCR Manager."""
-        self._engine_instances: Dict[str, OCREngine] = {} # Cache for engine instances
+        self._engine_instances: Dict[str, OCREngine] = {}  # Cache for engine instances
         logger.info("OCRManager initialized.")
     def _get_engine_instance(self, engine_name: str) -> OCREngine:
         """Retrieves or creates an instance of the specified OCR engine."""
         engine_name = engine_name.lower()
         if engine_name not in self.ENGINE_REGISTRY:
-            raise ValueError(f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}")
+            raise ValueError(
+                f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
+            )
         # Surya engine might manage its own predictor state, consider if caching instance is always right
         # For now, we cache the engine instance itself.
         if engine_name not in self._engine_instances:
             logger.info(f"Creating instance of engine: {engine_name}")
-            engine_class = self.ENGINE_REGISTRY[engine_name]['class']
-            engine_instance = engine_class() # Instantiate first
+            engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
+            engine_instance = engine_class()  # Instantiate first
             if not engine_instance.is_available():
-                 # Check availability before storing
-                 raise RuntimeError(f"Engine '{engine_name}' is not available. Please check dependencies.")
-            self._engine_instances[engine_name] = engine_instance # Store if available
+                # Check availability before storing
+                raise RuntimeError(
+                    f"Engine '{engine_name}' is not available. Please check dependencies."
+                )
+            self._engine_instances[engine_name] = engine_instance  # Store if available
         return self._engine_instances[engine_name]
     def apply_ocr(
         self,
-        images: Union[Image.Image, List[Image.Image]], # Accept single or list
-        engine: Optional[str] = 'easyocr', # Default engine
+        images: Union[Image.Image, List[Image.Image]],  # Accept single or list
+        engine: Optional[str] = "easyocr",  # Default engine
         options: Optional[OCROptions] = None,
-        **kwargs
-    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
+        **kwargs,
+    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:  # Return single or list of lists
         """
         Applies OCR to a single image or a batch of images using either simple
         keyword arguments or an options object.
@@ -94,54 +102,62 @@ class OCRManager:
         # --- Validate input type ---
         is_batch = isinstance(images, list)
         if not is_batch and not isinstance(images, Image.Image):
-             raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
+            raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
         # Allow engines to handle non-PIL images in list if they support it/log warnings
         # if is_batch and not all(isinstance(img, Image.Image) for img in images):
         #     logger.warning("Batch may contain items that are not PIL Images.")
         # --- Determine Options and Engine ---
         if options is not None:
             # Advanced Mode
             logger.debug(f"Using advanced mode with options object: {type(options).__name__}")
-            final_options = copy.deepcopy(options) # Prevent modification of original
+            final_options = copy.deepcopy(options)  # Prevent modification of original
             found_engine = False
             for name, registry_entry in self.ENGINE_REGISTRY.items():
                 # Check if options object is an instance of the registered options class
-                if isinstance(options, registry_entry['options_class']):
+                if isinstance(options, registry_entry["options_class"]):
                     selected_engine_name = name
                     found_engine = True
                     break
             if not found_engine:
-                 raise TypeError(f"Provided options object type '{type(options).__name__}' does not match any registered engine options.")
+                raise TypeError(
+                    f"Provided options object type '{type(options).__name__}' does not match any registered engine options."
+                )
             if kwargs:
-                logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored.")
+                logger.warning(
+                    f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored."
+                )
         else:
             # Simple Mode
-            selected_engine_name = engine.lower() if engine else 'easyocr' # Fallback default
-            logger.debug(f"Using simple mode with engine: '{selected_engine_name}' and kwargs: {kwargs}")
+            selected_engine_name = engine.lower() if engine else "easyocr"  # Fallback default
+            logger.debug(
+                f"Using simple mode with engine: '{selected_engine_name}' and kwargs: {kwargs}"
+            )
             if selected_engine_name not in self.ENGINE_REGISTRY:
-                 raise ValueError(f"Unknown OCR engine: '{selected_engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}")
+                raise ValueError(
+                    f"Unknown OCR engine: '{selected_engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
+                )
             unexpected_kwargs = set(kwargs.keys()) - self.SIMPLE_MODE_ALLOWED_KWARGS
             if unexpected_kwargs:
-                raise TypeError(f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration.")
+                raise TypeError(
+                    f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration."
+                )
             # Get the *correct* options class for the selected engine
-            options_class = self.ENGINE_REGISTRY[selected_engine_name]['options_class']
+            options_class = self.ENGINE_REGISTRY[selected_engine_name]["options_class"]
             # Create options instance using provided simple kwargs or defaults
             simple_args = {
-                'languages': kwargs.get('languages', ['en']),
-                'min_confidence': kwargs.get('min_confidence', 0.5),
-                'device': kwargs.get('device', 'cpu')
+                "languages": kwargs.get("languages", ["en"]),
+                "min_confidence": kwargs.get("min_confidence", 0.5),
+                "device": kwargs.get("device", "cpu"),
                 # Note: 'extra_args' isn't populated in simple mode
             }
             final_options = options_class(**simple_args)
             logger.debug(f"Constructed options for simple mode: {final_options}")
         # --- Get Engine Instance and Process ---
         try:
             engine_instance = self._get_engine_instance(selected_engine_name)
@@ -153,39 +169,49 @@ class OCRManager:
             # Log result summary based on mode
             if is_batch:
-                 # Ensure results is a list before trying to get lengths
-                 if isinstance(results, list):
-                     num_results_per_image = [len(res_list) if isinstance(res_list, list) else -1 for res_list in results] # Handle potential errors returning non-lists
-                     logger.info(f"Processing complete. Found results per image: {num_results_per_image}")
-                 else:
-                     logger.error(f"Processing complete but received unexpected result type for batch: {type(results)}")
+                # Ensure results is a list before trying to get lengths
+                if isinstance(results, list):
+                    num_results_per_image = [
+                        len(res_list) if isinstance(res_list, list) else -1 for res_list in results
+                    ]  # Handle potential errors returning non-lists
+                    logger.info(
+                        f"Processing complete. Found results per image: {num_results_per_image}"
+                    )
+                else:
+                    logger.error(
+                        f"Processing complete but received unexpected result type for batch: {type(results)}"
+                    )
             else:
-                 # Ensure results is a list
-                 if isinstance(results, list):
-                      logger.info(f"Processing complete. Found {len(results)} results.")
-                 else:
-                      logger.error(f"Processing complete but received unexpected result type for single image: {type(results)}")
-            return results # Return type matches input type due to engine logic
+                # Ensure results is a list
+                if isinstance(results, list):
+                    logger.info(f"Processing complete. Found {len(results)} results.")
+                else:
+                    logger.error(
+                        f"Processing complete but received unexpected result type for single image: {type(results)}"
+                    )
+            return results  # Return type matches input type due to engine logic
         except (ImportError, RuntimeError, ValueError, TypeError) as e:
-             logger.error(f"OCR processing failed for engine '{selected_engine_name}': {e}", exc_info=True)
-             raise # Re-raise expected errors
+            logger.error(
+                f"OCR processing failed for engine '{selected_engine_name}': {e}", exc_info=True
+            )
+            raise  # Re-raise expected errors
         except Exception as e:
-             logger.error(f"An unexpected error occurred during OCR processing: {e}", exc_info=True)
-             raise # Re-raise unexpected errors
+            logger.error(f"An unexpected error occurred during OCR processing: {e}", exc_info=True)
+            raise  # Re-raise unexpected errors
     def get_available_engines(self) -> List[str]:
         """Returns a list of registered engine names that are currently available."""
         available = []
         for name, registry_entry in self.ENGINE_REGISTRY.items():
-             try:
-                 # Temporarily instantiate to check availability without caching
-                 engine_class = registry_entry['class']
-                 if engine_class().is_available():
-                     available.append(name)
-             except Exception as e:
-                 logger.debug(f"Engine '{name}' check failed: {e}") # Log check failures at debug level
-                 pass # Ignore engines that fail to instantiate or check
+            try:
+                # Temporarily instantiate to check availability without caching
+                engine_class = registry_entry["class"]
+                if engine_class().is_available():
+                    available.append(name)
+            except Exception as e:
+                logger.debug(
+                    f"Engine '{name}' check failed: {e}"
+                )  # Log check failures at debug level
+                pass  # Ignore engines that fail to instantiate or check
         return available

natural_pdf/ocr/ocr_options.py CHANGED Viewed

@@ -1,30 +1,34 @@
 # ocr_options.py
 import logging
 from dataclasses import dataclass, field
-from typing import List, Optional, Dict, Any, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 # Configure logging
 # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 # logger = logging.getLogger(__name__)
 # Assume logger is configured elsewhere or remove if not needed globally
 # --- Base Options ---
 @dataclass
 class BaseOCROptions:
     """Base class for OCR engine options."""
-    languages: List[str] = field(default_factory=lambda: ['en'])
+    languages: List[str] = field(default_factory=lambda: ["en"])
     min_confidence: float = 0.5
-    device: Optional[str] = 'cpu' # Suggestion, actual device usage depends on engine impl.
+    device: Optional[str] = "cpu"  # Suggestion, actual device usage depends on engine impl.
     extra_args: Dict[str, Any] = field(default_factory=dict)
 # --- EasyOCR Specific Options ---
 @dataclass
 class EasyOCROptions(BaseOCROptions):
     """Specific options for the EasyOCR engine."""
     model_storage_directory: Optional[str] = None
     user_network_directory: Optional[str] = None
-    recog_network: str = 'english_g2'
-    detect_network: str = 'craft'
+    recog_network: str = "english_g2"
+    detect_network: str = "craft"
     download_enabled: bool = True
     detector: bool = True
     recognizer: bool = True
@@ -32,7 +36,7 @@ class EasyOCROptions(BaseOCROptions):
     quantize: bool = True
     cudnn_benchmark: bool = False
     detail: int = 1
-    decoder: str = 'greedy'
+    decoder: str = "greedy"
     beamWidth: int = 5
     batch_size: int = 1
     workers: int = 0
@@ -55,7 +59,7 @@ class EasyOCROptions(BaseOCROptions):
     y_ths: float = 0.5
     x_ths: float = 1.0
     add_margin: float = 0.1
-    output_format: str = 'standard'
+    output_format: str = "standard"
     # def __post_init__(self):
     #     logger.debug(f"Initialized EasyOCROptions: {self}")
@@ -65,13 +69,14 @@ class EasyOCROptions(BaseOCROptions):
 @dataclass
 class PaddleOCROptions(BaseOCROptions):
     """Specific options for the PaddleOCR engine."""
     use_angle_cls: bool = True
     use_gpu: Optional[bool] = None
     gpu_mem: int = 500
     ir_optim: bool = True
     use_tensorrt: bool = False
     min_subgraph_size: int = 15
-    precision: str = 'fp32'
+    precision: str = "fp32"
     enable_mkldnn: bool = False
     cpu_threads: int = 10
     use_fp16: bool = False
@@ -91,16 +96,18 @@ class PaddleOCROptions(BaseOCROptions):
     def __post_init__(self):
         if self.use_gpu is None:
-            if self.device and 'cuda' in self.device.lower():
+            if self.device and "cuda" in self.device.lower():
                 self.use_gpu = True
             else:
                 self.use_gpu = False
         # logger.debug(f"Initialized PaddleOCROptions: {self}")
 # --- Surya Specific Options ---
 @dataclass
 class SuryaOCROptions(BaseOCROptions):
     """Specific options for the Surya OCR engine."""
     # Currently, Surya example shows languages passed at prediction time.
     # Add fields here if Surya's RecognitionPredictor or DetectionPredictor
     # constructors accept relevant arguments (e.g., model paths, device settings).
@@ -111,4 +118,3 @@ class SuryaOCROptions(BaseOCROptions):
 # --- Union type for type hinting ---
 OCROptions = Union[EasyOCROptions, PaddleOCROptions, SuryaOCROptions, BaseOCROptions]

natural_pdf/qa/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 from natural_pdf.qa.document_qa import DocumentQA, get_qa_engine
-__all__ = ["DocumentQA", "get_qa_engine"]
+__all__ = ["DocumentQA", "get_qa_engine"]

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl