PyPI - natural-pdf - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl - Mend

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

docs/categorizing-documents/index.md +168 -0
docs/data-extraction/index.md +87 -0
docs/element-selection/index.ipynb +218 -164
docs/element-selection/index.md +20 -0
docs/index.md +19 -0
docs/ocr/index.md +63 -16
docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
docs/tutorials/02-finding-elements.ipynb +123 -46
docs/tutorials/03-extracting-blocks.ipynb +24 -19
docs/tutorials/04-table-extraction.ipynb +17 -12
docs/tutorials/05-excluding-content.ipynb +37 -32
docs/tutorials/06-document-qa.ipynb +36 -31
docs/tutorials/07-layout-analysis.ipynb +45 -40
docs/tutorials/07-working-with-regions.ipynb +61 -60
docs/tutorials/08-spatial-navigation.ipynb +76 -71
docs/tutorials/09-section-extraction.ipynb +160 -155
docs/tutorials/10-form-field-extraction.ipynb +71 -66
docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
docs/tutorials/12-ocr-integration.ipynb +3420 -312
docs/tutorials/12-ocr-integration.md +68 -106
docs/tutorials/13-semantic-search.ipynb +641 -251
natural_pdf/__init__.py +2 -0
natural_pdf/classification/manager.py +343 -0
natural_pdf/classification/mixin.py +149 -0
natural_pdf/classification/results.py +62 -0
natural_pdf/collections/mixins.py +63 -0
natural_pdf/collections/pdf_collection.py +321 -15
natural_pdf/core/element_manager.py +67 -0
natural_pdf/core/page.py +227 -64
natural_pdf/core/pdf.py +387 -378
natural_pdf/elements/collections.py +272 -41
natural_pdf/elements/region.py +99 -15
natural_pdf/elements/text.py +5 -2
natural_pdf/exporters/paddleocr.py +1 -1
natural_pdf/extraction/manager.py +134 -0
natural_pdf/extraction/mixin.py +246 -0
natural_pdf/extraction/result.py +37 -0
natural_pdf/ocr/engine_easyocr.py +6 -3
natural_pdf/ocr/ocr_manager.py +85 -25
natural_pdf/ocr/ocr_options.py +33 -10
natural_pdf/ocr/utils.py +14 -3
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/selectors/parser.py +363 -238
natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/text_extraction.py +52 -1
natural_pdf/utils/tqdm_utils.py +43 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0

natural_pdf/ocr/engine_easyocr.py CHANGED Viewed

@@ -143,11 +143,13 @@ class EasyOCREngine(OCREngine):
         standardized_regions = []
         if detect_only:
+            results = raw_results[0]
             # In detect_only mode, raw_results is already a list of bounding boxes
             # Each bbox is in [x_min, x_max, y_min, y_max] format
-            if isinstance(raw_results, list):
-                for detection in raw_results:
+            if isinstance(results, list):
+                for detection in results:
                     try:
+                        # This block expects 'detection' to be a list/tuple of 4 numbers
                         if isinstance(detection, (list, tuple)) and len(detection) == 4:
                             x_min, x_max, y_min, y_max = detection
                             # Convert to standardized (x0, y0, x1, y1) format
@@ -161,6 +163,7 @@ class EasyOCREngine(OCREngine):
                                     f"Invalid number format in EasyOCR detect bbox: {detection}"
                                 ) from e
                         else:
+                            # This is where the error is raised if 'detection' is not a list/tuple of 4 numbers
                             raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
                     except ValueError as e:
                         # Re-raise any value errors from standardization or format checks
@@ -172,7 +175,7 @@ class EasyOCREngine(OCREngine):
                         ) from e
             else:
                 raise ValueError(
-                    f"Expected list of bounding boxes in detect_only mode, got: {raw_results}"
+                    f"Expected list of bounding boxes in detect_only mode, got: {type(raw_results)}"
                 )
             return standardized_regions

natural_pdf/ocr/ocr_manager.py CHANGED Viewed

@@ -2,6 +2,8 @@
 import copy  # For deep copying options
 import logging
 from typing import Any, Dict, List, Optional, Type, Union
+import threading # Import threading for lock
+import time # Import time for timing
 from PIL import Image
@@ -30,30 +32,68 @@ class OCRManager:
     def __init__(self):
         """Initializes the OCR Manager."""
         self._engine_instances: Dict[str, OCREngine] = {}  # Cache for engine instances
+        self._engine_locks: Dict[str, threading.Lock] = {} # Lock per engine type for initialization
+        self._engine_inference_locks: Dict[str, threading.Lock] = {} # Lock per engine type for inference
         logger.info("OCRManager initialized.")
     def _get_engine_instance(self, engine_name: str) -> OCREngine:
-        """Retrieves or creates an instance of the specified OCR engine."""
+        """Retrieves or creates an instance of the specified OCR engine, ensuring thread-safe initialization."""
         engine_name = engine_name.lower()
         if engine_name not in self.ENGINE_REGISTRY:
             raise ValueError(
                 f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
             )
-        if engine_name not in self._engine_instances:
-            logger.info(f"Creating instance of engine: {engine_name}")
-            engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
-            engine_instance = engine_class()  # Instantiate first
-            if not engine_instance.is_available():
-                # Check availability before storing
-                # Construct helpful error message with install hint
-                install_hint = f"pip install 'natural-pdf[{engine_name}]'"
-                raise RuntimeError(
-                    f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
-                )
-            self._engine_instances[engine_name] = engine_instance  # Store if available
+        # Quick check if instance already exists (avoid lock contention)
+        if engine_name in self._engine_instances:
+            return self._engine_instances[engine_name]
-        return self._engine_instances[engine_name]
+        # Get or create the lock for this engine type
+        if engine_name not in self._engine_locks:
+            self._engine_locks[engine_name] = threading.Lock()
+        engine_init_lock = self._engine_locks[engine_name]
+        # Acquire lock to safely check and potentially initialize the engine
+        with engine_init_lock:
+            # Double-check if another thread initialized it while we waited for the lock
+            if engine_name in self._engine_instances:
+                return self._engine_instances[engine_name]
+            # If still not initialized, create it now under the lock
+            logger.info(f"[{threading.current_thread().name}] Creating shared instance of engine: {engine_name}")
+            engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
+            start_time = time.monotonic() # Optional: time initialization
+            try:
+                engine_instance = engine_class()  # Instantiate first
+                if not engine_instance.is_available():
+                    # Check availability before storing
+                    install_hint = f"pip install 'natural-pdf[{engine_name}]'"
+                    raise RuntimeError(
+                        f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
+                    )
+                # Store the shared instance
+                self._engine_instances[engine_name] = engine_instance
+                end_time = time.monotonic()
+                logger.info(f"[{threading.current_thread().name}] Shared instance of {engine_name} created successfully (Duration: {end_time - start_time:.2f}s).")
+                return engine_instance
+            except Exception as e:
+                 # Ensure we don't leave a partial state if init fails
+                 logger.error(f"[{threading.current_thread().name}] Failed to create shared instance of {engine_name}: {e}", exc_info=True)
+                 # Remove potentially partial entry if exists
+                 if engine_name in self._engine_instances: del self._engine_instances[engine_name]
+                 raise # Re-raise the exception after logging
+    def _get_engine_inference_lock(self, engine_name: str) -> threading.Lock:
+        """Gets or creates the inference lock for a given engine type."""
+        engine_name = engine_name.lower()
+        # Assume engine_name is valid as it's checked before this would be called
+        if engine_name not in self._engine_inference_locks:
+            # Create lock if it doesn't exist (basic thread safety for dict access)
+            # A more robust approach might lock around this check/creation too,
+            # but contention here is less critical than for engine init or inference itself.
+            self._engine_inference_locks[engine_name] = threading.Lock()
+        return self._engine_inference_locks[engine_name]
     def apply_ocr(
         self,
@@ -127,21 +167,41 @@ class OCRManager:
         try:
             engine_instance = self._get_engine_instance(selected_engine_name)
             processing_mode = "batch" if is_batch else "single image"
-            logger.info(f"Processing {processing_mode} with engine '{selected_engine_name}'...")
+            # Log thread name for clarity during parallel calls
+            thread_id = threading.current_thread().name
+            logger.info(f"[{thread_id}] Processing {processing_mode} using shared engine instance '{selected_engine_name}'...")
             logger.debug(
                 f"  Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}"
             )
-            # Call the engine's process_image, passing common args and options object
-            # **ASSUMPTION**: Engine process_image signatures are updated to accept these common args.
-            results = engine_instance.process_image(
-                images=images,
-                languages=languages,
-                min_confidence=min_confidence,
-                device=device,
-                detect_only=detect_only,
-                options=final_options,
-            )
+            # Log image dimensions before processing
+            if is_batch:
+                image_dims = [f"{img.width}x{img.height}" for img in images if hasattr(img, 'width') and hasattr(img, 'height')]
+                logger.debug(f"[{thread_id}] Processing batch of {len(images)} images with dimensions: {image_dims}")
+            elif hasattr(images, 'width') and hasattr(images, 'height'):
+                logger.debug(f"[{thread_id}] Processing single image with dimensions: {images.width}x{images.height}")
+            else:
+                logger.warning(f"[{thread_id}] Could not determine dimensions of input image(s).")
+            # Acquire lock specifically for the inference call
+            inference_lock = self._get_engine_inference_lock(selected_engine_name)
+            logger.debug(f"[{thread_id}] Attempting to acquire inference lock for {selected_engine_name}...")
+            inference_wait_start = time.monotonic()
+            with inference_lock:
+                inference_acquired_time = time.monotonic()
+                logger.debug(f"[{thread_id}] Acquired inference lock for {selected_engine_name} (waited {inference_acquired_time - inference_wait_start:.2f}s). Calling process_image...")
+                inference_start_time = time.monotonic()
+                results = engine_instance.process_image(
+                    images=images,
+                    languages=languages,
+                    min_confidence=min_confidence,
+                    device=device,
+                    detect_only=detect_only,
+                    options=final_options,
+                )
+                inference_end_time = time.monotonic()
+                logger.debug(f"[{thread_id}] process_image call finished for {selected_engine_name} (Duration: {inference_end_time - inference_start_time:.2f}s). Releasing lock.")
             # Log result summary based on mode
             if is_batch:

natural_pdf/ocr/ocr_options.py CHANGED Viewed

@@ -67,9 +67,9 @@ class EasyOCROptions(BaseOCROptions):
 class PaddleOCROptions(BaseOCROptions):
     """Specific options for the PaddleOCR engine."""
-    use_angle_cls: bool = True
+    # General
     use_gpu: Optional[bool] = None
-    gpu_mem: int = 500
+    gpu_mem: int = 8000 # Default from Paddle documentation
     ir_optim: bool = True
     use_tensorrt: bool = False
     min_subgraph_size: int = 15
@@ -77,19 +77,42 @@ class PaddleOCROptions(BaseOCROptions):
     enable_mkldnn: bool = False
     cpu_threads: int = 10
     use_fp16: bool = False
+    show_log: bool = False
+    use_onnx: bool = False
+    use_zero_copy_run: bool = False
+    # Detection
+    det: bool = True
+    det_algorithm: str = "DB"
     det_model_dir: Optional[str] = None
+    det_limit_side_len: int = 960 # Corresponds to det_max_side_len
+    # DB specific
+    det_db_thresh: float = 0.3
+    det_db_box_thresh: float = 0.5
+    det_db_unclip_ratio: float = 2.0
+    # EAST specific
+    det_east_score_thresh: float = 0.8
+    det_east_cover_thresh: float = 0.1
+    det_east_nms_thresh: float = 0.2
+    # Recognition
+    rec: bool = True
+    rec_algorithm: str = "CRNN"
     rec_model_dir: Optional[str] = None
-    cls_model_dir: Optional[str] = None
-    det_limit_side_len: int = 960
-    rec_batch_num: int = 6
+    rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
+    rec_batch_num: int = 30 # Default from Paddle documentation
     max_text_length: int = 25
+    rec_char_dict_path: Optional[str] = None # Path to char dictionary file
     use_space_char: bool = True
     drop_score: float = 0.5
-    show_log: bool = False
-    use_onnx: bool = False
-    det: bool = True
-    rec: bool = True
-    cls: Optional[bool] = None
+    # Classification
+    cls: Optional[bool] = None # Often inferred from use_angle_cls
+    use_angle_cls: bool = False # Default from Paddle documentation
+    cls_model_dir: Optional[str] = None
+    cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
+    label_list: List[str] = field(default_factory=lambda: ['0', '180']) # Default from Paddle doc
+    cls_batch_num: int = 30
     def __post_init__(self):
         pass

natural_pdf/ocr/utils.py CHANGED Viewed

@@ -8,6 +8,9 @@ from tqdm.auto import tqdm
 if TYPE_CHECKING:
     from natural_pdf.elements.base import Element
+# Import the global PDF render lock from dedicated locks module
+from natural_pdf.utils.locks import pdf_render_lock
 logger = logging.getLogger(__name__)
@@ -72,7 +75,7 @@ def direct_ocr_llm(
     client,
     model="",
     resolution=150,
-    prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc.",
+    prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc. If you cannot see any text, return an empty string.",
     padding=2,
 ) -> str:
     """Convenience method to directly OCR a region of the page."""
@@ -83,7 +86,15 @@ def direct_ocr_llm(
         region = element
     buffered = io.BytesIO()
-    region_img = region.to_image(resolution=resolution, include_highlights=False)
+    # Use the global PDF render lock when rendering images
+    with pdf_render_lock:
+        region_img = region.to_image(resolution=resolution, include_highlights=False)
+    # Handle cases where image creation might fail (e.g., zero-dim region)
+    if region_img is None:
+        logger.warning(f"Could not generate image for region {region.bbox}, skipping OCR.")
+        return "" # Return empty string if image creation failed
     region_img.save(buffered, format="PNG")
     base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
@@ -107,7 +118,7 @@ def direct_ocr_llm(
         ],
     )
-    corrected = response.choices[0].message.content
+    corrected = response.choices[0].message.content.strip()
     logger.debug(f"Corrected {region.extract_text()} to {corrected}")
     return corrected

natural_pdf/qa/document_qa.py CHANGED Viewed

@@ -58,10 +58,6 @@ class DocumentQA:
             import torch
             from transformers import pipeline
-            # Determine device
-            if device is None:
-                device = "cuda" if torch.cuda.is_available() else "cpu"
             logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
             # Initialize the pipeline

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl