PyPI - natural-pdf - Versions diffs - 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl - Mend

natural-pdf 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

natural_pdf/__init__.py +31 -0
natural_pdf/analyzers/layout/gemini.py +137 -162
natural_pdf/analyzers/layout/layout_manager.py +9 -5
natural_pdf/analyzers/layout/layout_options.py +77 -7
natural_pdf/analyzers/layout/paddle.py +318 -165
natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
natural_pdf/analyzers/shape_detection_mixin.py +770 -405
natural_pdf/classification/mixin.py +2 -8
natural_pdf/collections/pdf_collection.py +25 -30
natural_pdf/core/highlighting_service.py +47 -32
natural_pdf/core/page.py +226 -70
natural_pdf/core/pdf.py +19 -22
natural_pdf/elements/base.py +9 -9
natural_pdf/elements/collections.py +105 -50
natural_pdf/elements/region.py +320 -113
natural_pdf/exporters/paddleocr.py +38 -13
natural_pdf/flows/__init__.py +3 -3
natural_pdf/flows/collections.py +303 -132
natural_pdf/flows/element.py +277 -132
natural_pdf/flows/flow.py +33 -16
natural_pdf/flows/region.py +142 -79
natural_pdf/ocr/engine_doctr.py +37 -4
natural_pdf/ocr/engine_easyocr.py +23 -3
natural_pdf/ocr/engine_paddle.py +281 -30
natural_pdf/ocr/engine_surya.py +8 -3
natural_pdf/ocr/ocr_manager.py +75 -76
natural_pdf/ocr/ocr_options.py +52 -87
natural_pdf/search/__init__.py +25 -12
natural_pdf/search/lancedb_search_service.py +91 -54
natural_pdf/search/numpy_search_service.py +86 -65
natural_pdf/search/searchable_mixin.py +2 -2
natural_pdf/selectors/parser.py +125 -81
natural_pdf/widgets/__init__.py +1 -1
natural_pdf/widgets/viewer.py +205 -449
{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0

natural_pdf/ocr/engine_paddle.py CHANGED Viewed

@@ -11,10 +11,99 @@ from .ocr_options import BaseOCROptions, PaddleOCROptions
 logger = logging.getLogger(__name__)
 class PaddleOCREngine(OCREngine):
     """PaddleOCR engine implementation."""
+    SUPPORT_MATRIX = {
+        "PP-OCRv5": {"ch", "chinese_cht", "en", "japan"},
+        "PP-OCRv4": {"ch", "en"},
+        "PP-OCRv3": {
+            "abq",
+            "af",
+            "ady",
+            "ang",
+            "ar",
+            "ava",
+            "az",
+            "be",
+            "bg",
+            "bgc",
+            "bh",
+            "bho",
+            "bs",
+            "ch",
+            "che",
+            "chinese_cht",
+            "cs",
+            "cy",
+            "da",
+            "dar",
+            "de",
+            "german",
+            "en",
+            "es",
+            "et",
+            "fa",
+            "fr",
+            "french",
+            "ga",
+            "gom",
+            "hi",
+            "hr",
+            "hu",
+            "id",
+            "inh",
+            "is",
+            "it",
+            "japan",
+            "ka",
+            "kbd",
+            "korean",
+            "ku",
+            "la",
+            "lbe",
+            "lez",
+            "lt",
+            "lv",
+            "mah",
+            "mai",
+            "mi",
+            "mn",
+            "mr",
+            "ms",
+            "mt",
+            "ne",
+            "new",
+            "nl",
+            "no",
+            "oc",
+            "pi",
+            "pl",
+            "pt",
+            "ro",
+            "rs_cyrillic",
+            "rs_latin",
+            "ru",
+            "sa",
+            "sck",
+            "sk",
+            "sl",
+            "sq",
+            "sv",
+            "sw",
+            "ta",
+            "tab",
+            "te",
+            "tl",
+            "tr",
+            "ug",
+            "uk",
+            "ur",
+            "uz",
+            "vi",
+        },
+    }
     def __init__(self):
         super().__init__()
@@ -30,43 +119,160 @@ class PaddleOCREngine(OCREngine):
     def _initialize_model(
         self, languages: List[str], device: str, options: Optional[BaseOCROptions]
     ):
-        """Initialize the PaddleOCR model."""
+        """Initialize the PaddleOCR model using the >=3.0.0 pipeline API."""
         try:
             import paddleocr
             self.logger.info("PaddleOCR module imported successfully.")
         except ImportError as e:
             self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
-            raise
+            raise RuntimeError(
+                "paddleocr is not available. Please install it and paddlepaddle with: pip install -U paddlepaddle paddleocr"
+            ) from e
-        # Cast to PaddleOCROptions if possible
         paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
-        # Determine parameters
+        if len(languages) > 1:
+            self.logger.warning(
+                "PaddleOCR >= 3.0.0 only supports one language at a time. "
+                "Using the first language provided: '%s'",
+                languages[0],
+            )
         primary_lang = languages[0] if languages else "en"
-        use_gpu = "cuda" in str(device).lower()
-        # Create constructor arguments
-        constructor_args = {
+        # Determine the appropriate ocr_version based on language support
+        user_ocr_version = paddle_options.ocr_version
+        final_ocr_version = user_ocr_version
+        version_preference = ["PP-OCRv5", "PP-OCRv4", "PP-OCRv3"]
+        # --- RESTORE: Language/version support check logic ---
+        user_specified_model = (
+            getattr(paddle_options, "text_recognition_model_name", None) is not None or
+            getattr(paddle_options, "text_detection_model_name", None) is not None
+        )
+        if user_specified_model and user_ocr_version:
+            if primary_lang not in self.SUPPORT_MATRIX.get(user_ocr_version, set()):
+                self.logger.warning(
+                    f"Model '{user_ocr_version}' was explicitly specified, but language '{primary_lang}' is not officially supported. Proceeding anyway as requested."
+                )
+        if user_ocr_version:
+            if primary_lang not in self.SUPPORT_MATRIX.get(user_ocr_version, set()):
+                self.logger.warning(
+                    f"Language '{primary_lang}' is not supported by the requested ocr_version '{user_ocr_version}'. "
+                    f"Attempting to find a compatible version."
+                )
+                self.logger.warning(
+                    "Language '%s' is not supported by the requested ocr_version '%s'. "
+                    "Attempting to find a compatible version.",
+                    primary_lang,
+                    user_ocr_version,
+                )
+                final_ocr_version = None  # Reset to find a compatible version
+        # If no version was specified or the specified one was incompatible, find the best fit.
+        if not final_ocr_version:
+            found_compatible = False
+            for version in version_preference:
+                if primary_lang in self.SUPPORT_MATRIX[version]:
+                    final_ocr_version = version
+                    found_compatible = True
+                    break
+            if not found_compatible:
+                if not languages or not primary_lang:
+                    final_ocr_version = "PP-OCRv5"
+                    self.logger.info(
+                        "No language specified and no match found. Defaulting to ocr_version 'PP-OCRv5'. Note: 'PP-OCRv3' has the widest language support among PaddleOCR versions."
+                    )
+                else:
+                    self.logger.error(
+                        "Language '%s' is not supported by any available PaddleOCR version (v3, v4, v5). "
+                        "Proceeding without a specific version, but this is likely to fail.",
+                        primary_lang,
+                    )
+                    final_ocr_version = None  # Let paddleocr handle the error
+            elif final_ocr_version != "PP-OCRv5":
+                self.logger.warning(
+                    f"Automatically selected ocr_version '{final_ocr_version}' for language '{primary_lang}'. This is not the default (PP-OCRv5)."
+                )
+                self.logger.warning(
+                    "Automatically selected ocr_version '%s' for language '%s'. This is not the default (PP-OCRv5).",
+                    final_ocr_version,
+                    primary_lang,
+                )
+            # else: if PP-OCRv5, no need to log
+        elif final_ocr_version != "PP-OCRv5":
+            self.logger.warning(
+                f"Using user-specified ocr_version '{final_ocr_version}' for language '{primary_lang}'. This is not the default (PP-OCRv5)."
+            )
+            self.logger.warning(
+                "Using user-specified ocr_version '%s' for language '%s'. This is not the default (PP-OCRv5).",
+                final_ocr_version,
+                primary_lang,
+            )
+        # --- END RESTORE ---
+        # Build PaddleOCR config dict from valid constructor arguments.
+        # See: https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/OCR.html
+        valid_init_args = {
+            "doc_orientation_classify_model_name",
+            "doc_orientation_classify_model_dir",
+            "doc_unwarping_model_name",
+            "doc_unwarping_model_dir",
+            "text_detection_model_name",
+            "text_detection_model_dir",
+            "textline_orientation_model_name",
+            "textline_orientation_model_dir",
+            "text_recognition_model_name",
+            "text_recognition_model_dir",
+            "textline_orientation_batch_size",
+            "text_recognition_batch_size",
+            "use_doc_orientation_classify",
+            "use_doc_unwarping",
+            "use_textline_orientation",
+            "text_det_limit_side_len",
+            "text_det_limit_type",
+            "text_det_thresh",
+            "text_det_box_thresh",
+            "text_det_unclip_ratio",
+            "text_det_input_shape",
+            "text_rec_score_thresh",
+            "text_rec_input_shape",
+            "lang",
+            "ocr_version",
+            "device",
+            "enable_hpi",
+            "use_tensorrt",
+            "precision",
+            "enable_mkldnn",
+            # "mkldnn_cache_capacity",
+            "cpu_threads",
+            "paddlex_config",
+        }
+        # Start with defaults passed from the main apply_ocr call.
+        ocr_config = {
             "lang": primary_lang,
-            "use_gpu": use_gpu,
-            "use_angle_cls": paddle_options.use_angle_cls,
-            "det": True,
-            "rec": True,  # We'll control recognition at process time
+            "device": device,
         }
-        # Add optional parameters if available
-        for param in ["det_model_dir", "rec_model_dir", "cls_model_dir", "show_log", "use_onnx"]:
-            if hasattr(paddle_options, param):
-                val = getattr(paddle_options, param)
-                if val is not None:
-                    constructor_args[param] = val
+        # Add the determined ocr_version to the config if available
+        if final_ocr_version:
+            ocr_config["ocr_version"] = final_ocr_version
+        # Populate ocr_config from paddle_options with non-None values
+        # that are valid for the constructor. This allows overriding defaults.
+        for arg in valid_init_args:
+            if hasattr(paddle_options, arg):
+                value = getattr(paddle_options, arg)
+                if value is not None:
+                    ocr_config[arg] = value
-        self.logger.debug(f"PaddleOCR constructor args: {constructor_args}")
-        # Create the model
         try:
-            self._model = paddleocr.PaddleOCR(**constructor_args)
+            # The new API uses PaddleOCR as a pipeline object.
+            self._model = paddleocr.PaddleOCR(**ocr_config)
             self.logger.info("PaddleOCR model created successfully")
         except Exception as e:
             self.logger.error(f"Failed to create PaddleOCR model: {e}")
@@ -84,19 +290,35 @@ class PaddleOCREngine(OCREngine):
     def _process_single_image(
         self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]
     ) -> Any:
-        """Process a single image with PaddleOCR."""
+        """Process a single image with PaddleOCR using the .predict() method."""
         if self._model is None:
             raise RuntimeError("PaddleOCR model not initialized")
-        # Prepare OCR arguments
-        ocr_args = {}
+        # Prepare arguments for the .predict() method from PaddleOCROptions.
+        # See: https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/OCR.html
+        predict_args = {}
         if options and isinstance(options, PaddleOCROptions):
-            ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
-            ocr_args["det"] = options.det
-            ocr_args["rec"] = not detect_only  # Control recognition based on detect_only flag
+            valid_predict_args = {
+                "use_doc_orientation_classify",
+                "use_doc_unwarping",
+                "use_textline_orientation",
+                "text_det_limit_side_len",
+                "text_det_limit_type",
+                "text_det_thresh",
+                "text_det_box_thresh",
+                "text_det_unclip_ratio",
+                "text_rec_score_thresh",
+            }
+            for arg in valid_predict_args:
+                if hasattr(options, arg) and getattr(options, arg) is not None:
+                    predict_args[arg] = getattr(options, arg)
+        # The `detect_only` flag is handled in `_standardize_results` by ignoring
+        # the recognized text and confidence, as the new .predict() API does not
+        # have a direct flag to disable only the recognition step.
-        # Run OCR
-        raw_results = self._model.ocr(image, **ocr_args)
+        # Run OCR using the new .predict() method.
+        raw_results = self._model.predict(image)
         return raw_results
     def _standardize_results(
@@ -108,8 +330,37 @@ class PaddleOCREngine(OCREngine):
         if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
             return standardized_regions
-        page_results = raw_results[0] if raw_results[0] is not None else []
+        # New PaddleOCR 3.x format: list of dicts with keys like 'rec_texts', 'rec_scores', 'rec_boxes'
+        if isinstance(raw_results[0], dict):
+            for page in raw_results:
+                rec_texts = page.get("rec_texts", [])
+                rec_scores = page.get("rec_scores", [])
+                rec_boxes = page.get("rec_boxes", [])
+                # Fallback to dt_polys if rec_boxes is not present or empty
+                if rec_boxes is None or len(rec_boxes) == 0:
+                    rec_boxes = page.get("dt_polys", [])
+                for i in range(len(rec_texts)):
+                    text = str(rec_texts[i]) if not detect_only else None
+                    confidence = float(rec_scores[i]) if not detect_only else None
+                    # --- Bounding box format note ---
+                    # PaddleOCR 3.x may return bounding boxes in several formats:
+                    # - Rectangle: [x1, y1, x2, y2] (list or 1D numpy array of length 4)
+                    # - Polygon: [[x1, y1], [x2, y2], [x3, y3], [x4, y4]] (list of 4 points or 2D numpy array shape (4,2))
+                    # - Sometimes, rec_boxes is a numpy array of shape (N, 4) or (N, 4, 2)
+                    # This code converts any numpy array to a list before passing to _standardize_bbox,
+                    # which handles both rectangle and polygon formats robustly.
+                    box = rec_boxes[i]
+                    if hasattr(box, 'tolist'):
+                        box = box.tolist()
+                    bbox = self._standardize_bbox(box)
+                    if detect_only:
+                        standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
+                    elif confidence is not None and confidence >= min_confidence:
+                        standardized_regions.append(TextRegion(bbox, text, confidence))
+            return standardized_regions
+        # Old format fallback (list of lists/tuples)
+        page_results = raw_results[0] if raw_results[0] is not None else []
         for detection in page_results:
             # Initialize text and confidence
             text = None

natural_pdf/ocr/engine_surya.py CHANGED Viewed

@@ -38,11 +38,17 @@ class SuryaOCREngine(OCREngine):
         self.logger.info("Surya modules imported successfully.")
         predictor_args = {}  # Configure if needed
+        # Filter only allowed Surya args (currently none, but placeholder for future)
+        allowed_args = set()  # Update if Surya supports constructor args
+        filtered_args = {k: v for k, v in predictor_args.items() if k in allowed_args}
+        dropped = set(predictor_args) - allowed_args
+        if dropped:
+            self.logger.warning(f"Dropped unsupported Surya args: {dropped}")
         self.logger.info("Instantiating Surya DetectionPredictor...")
-        self._detection_predictor = self._surya_detection(**predictor_args)
+        self._detection_predictor = self._surya_detection(**filtered_args)
         self.logger.info("Instantiating Surya RecognitionPredictor...")
-        self._recognition_predictor = self._surya_recognition(**predictor_args)
+        self._recognition_predictor = self._surya_recognition(**filtered_args)
         self.logger.info("Surya predictors initialized.")
@@ -70,7 +76,6 @@ class SuryaOCREngine(OCREngine):
         else:
             results = self._recognition_predictor(
                 images=[image],
-                langs=langs,  # Use the languages set during initialization
                 det_predictor=self._detection_predictor,
             )

natural_pdf/ocr/ocr_manager.py CHANGED Viewed

@@ -83,6 +83,15 @@ class OCRManager:
                 if not engine_instance.is_available():
                     # Check availability before storing
                     install_hint = f"pip install 'natural-pdf[{engine_name}]'"
+                    if engine_name == "easyocr":
+                        install_hint = "pip install easyocr"
+                    elif engine_name == "paddle":
+                        install_hint = "pip install paddleocr paddlepaddle"
+                    elif engine_name == "surya":
+                        install_hint = "pip install surya-ocr"
+                    elif engine_name == "doctr":
+                        install_hint = "pip install 'python-doctr[torch]'"
                     raise RuntimeError(
                         f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
                     )
@@ -184,93 +193,83 @@ class OCRManager:
                     )
         # --- Get Engine Instance and Process ---
-        try:
-            engine_instance = self._get_engine_instance(selected_engine_name)
-            processing_mode = "batch" if is_batch else "single image"
-            # Log thread name for clarity during parallel calls
-            thread_id = threading.current_thread().name
-            logger.info(
-                f"[{thread_id}] Processing {processing_mode} using shared engine instance '{selected_engine_name}'..."
+        engine_instance = self._get_engine_instance(selected_engine_name)
+        processing_mode = "batch" if is_batch else "single image"
+        # Log thread name for clarity during parallel calls
+        thread_id = threading.current_thread().name
+        logger.info(
+            f"[{thread_id}] Processing {processing_mode} using shared engine instance '{selected_engine_name}'..."
+        )
+        logger.debug(
+            f"  Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}"
+        )
+        # Log image dimensions before processing
+        if is_batch:
+            image_dims = [
+                f"{img.width}x{img.height}"
+                for img in images
+                if hasattr(img, "width") and hasattr(img, "height")
+            ]
+            logger.debug(
+                f"[{thread_id}] Processing batch of {len(images)} images with dimensions: {image_dims}"
             )
+        elif hasattr(images, "width") and hasattr(images, "height"):
             logger.debug(
-                f"  Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}"
+                f"[{thread_id}] Processing single image with dimensions: {images.width}x{images.height}"
             )
+        else:
+            logger.warning(f"[{thread_id}] Could not determine dimensions of input image(s).")
-            # Log image dimensions before processing
-            if is_batch:
-                image_dims = [
-                    f"{img.width}x{img.height}"
-                    for img in images
-                    if hasattr(img, "width") and hasattr(img, "height")
-                ]
-                logger.debug(
-                    f"[{thread_id}] Processing batch of {len(images)} images with dimensions: {image_dims}"
-                )
-            elif hasattr(images, "width") and hasattr(images, "height"):
-                logger.debug(
-                    f"[{thread_id}] Processing single image with dimensions: {images.width}x{images.height}"
-                )
-            else:
-                logger.warning(f"[{thread_id}] Could not determine dimensions of input image(s).")
+        # Acquire lock specifically for the inference call
+        inference_lock = self._get_engine_inference_lock(selected_engine_name)
+        logger.debug(
+            f"[{thread_id}] Attempting to acquire inference lock for {selected_engine_name}..."
+        )
+        inference_wait_start = time.monotonic()
+        with inference_lock:
+            inference_acquired_time = time.monotonic()
+            logger.debug(
+                f"[{thread_id}] Acquired inference lock for {selected_engine_name} (waited {inference_acquired_time - inference_wait_start:.2f}s). Calling process_image..."
+            )
+            inference_start_time = time.monotonic()
-            # Acquire lock specifically for the inference call
-            inference_lock = self._get_engine_inference_lock(selected_engine_name)
+            results = engine_instance.process_image(
+                images=images,
+                languages=languages,
+                min_confidence=min_confidence,
+                device=device,
+                detect_only=detect_only,
+                options=final_options,
+            )
+            inference_end_time = time.monotonic()
             logger.debug(
-                f"[{thread_id}] Attempting to acquire inference lock for {selected_engine_name}..."
+                f"[{thread_id}] process_image call finished for {selected_engine_name} (Duration: {inference_end_time - inference_start_time:.2f}s). Releasing lock."
             )
-            inference_wait_start = time.monotonic()
-            with inference_lock:
-                inference_acquired_time = time.monotonic()
-                logger.debug(
-                    f"[{thread_id}] Acquired inference lock for {selected_engine_name} (waited {inference_acquired_time - inference_wait_start:.2f}s). Calling process_image..."
-                )
-                inference_start_time = time.monotonic()
-                results = engine_instance.process_image(
-                    images=images,
-                    languages=languages,
-                    min_confidence=min_confidence,
-                    device=device,
-                    detect_only=detect_only,
-                    options=final_options,
+        # Log result summary based on mode
+        if is_batch:
+            # Ensure results is a list before trying to get lengths
+            if isinstance(results, list):
+                num_results_per_image = [
+                    len(res_list) if isinstance(res_list, list) else -1 for res_list in results
+                ]  # Handle potential errors returning non-lists
+                logger.info(
+                    f"Processing complete. Found results per image: {num_results_per_image}"
                 )
-                inference_end_time = time.monotonic()
-                logger.debug(
-                    f"[{thread_id}] process_image call finished for {selected_engine_name} (Duration: {inference_end_time - inference_start_time:.2f}s). Releasing lock."
+            else:
+                logger.error(
+                    f"Processing complete but received unexpected result type for batch: {type(results)}"
                 )
-            # Log result summary based on mode
-            if is_batch:
-                # Ensure results is a list before trying to get lengths
-                if isinstance(results, list):
-                    num_results_per_image = [
-                        len(res_list) if isinstance(res_list, list) else -1 for res_list in results
-                    ]  # Handle potential errors returning non-lists
-                    logger.info(
-                        f"Processing complete. Found results per image: {num_results_per_image}"
-                    )
-                else:
-                    logger.error(
-                        f"Processing complete but received unexpected result type for batch: {type(results)}"
-                    )
+        else:
+            # Ensure results is a list
+            if isinstance(results, list):
+                logger.info(f"Processing complete. Found {len(results)} results.")
             else:
-                # Ensure results is a list
-                if isinstance(results, list):
-                    logger.info(f"Processing complete. Found {len(results)} results.")
-                else:
-                    logger.error(
-                        f"Processing complete but received unexpected result type for single image: {type(results)}"
-                    )
-            return results  # Return type matches input type due to engine logic
-        except (ImportError, RuntimeError, ValueError, TypeError) as e:
-            logger.error(
-                f"OCR processing failed for engine '{selected_engine_name}': {e}", exc_info=True
-            )
-            raise  # Re-raise expected errors
-        except Exception as e:
-            logger.error(f"An unexpected error occurred during OCR processing: {e}", exc_info=True)
-            raise  # Re-raise unexpected errors
+                logger.error(
+                    f"Processing complete but received unexpected result type for single image: {type(results)}"
+                )
+        return results  # Return type matches input type due to engine logic
     def get_available_engines(self) -> List[str]:
         """Returns a list of registered engine names that are currently available."""

natural-pdf 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl

natural-pdf 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl