PyPI - natural-pdf - Versions diffs - 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl - Mend

natural-pdf 0.1.5py3-none-any.whl → 0.1.7py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

docs/finetuning/index.md +176 -0
docs/ocr/index.md +34 -47
docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
docs/tutorials/02-finding-elements.ipynb +42 -42
docs/tutorials/03-extracting-blocks.ipynb +17 -17
docs/tutorials/04-table-extraction.ipynb +12 -12
docs/tutorials/05-excluding-content.ipynb +30 -30
docs/tutorials/06-document-qa.ipynb +28 -28
docs/tutorials/07-layout-analysis.ipynb +63 -35
docs/tutorials/07-working-with-regions.ipynb +55 -51
docs/tutorials/07-working-with-regions.md +2 -2
docs/tutorials/08-spatial-navigation.ipynb +60 -60
docs/tutorials/09-section-extraction.ipynb +113 -113
docs/tutorials/10-form-field-extraction.ipynb +78 -50
docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
docs/tutorials/12-ocr-integration.ipynb +149 -131
docs/tutorials/12-ocr-integration.md +0 -13
docs/tutorials/13-semantic-search.ipynb +313 -873
natural_pdf/__init__.py +21 -22
natural_pdf/analyzers/layout/gemini.py +280 -0
natural_pdf/analyzers/layout/layout_manager.py +28 -1
natural_pdf/analyzers/layout/layout_options.py +11 -0
natural_pdf/analyzers/layout/yolo.py +6 -2
natural_pdf/collections/pdf_collection.py +24 -0
natural_pdf/core/element_manager.py +18 -13
natural_pdf/core/page.py +174 -36
natural_pdf/core/pdf.py +156 -42
natural_pdf/elements/base.py +9 -17
natural_pdf/elements/collections.py +99 -38
natural_pdf/elements/region.py +77 -37
natural_pdf/elements/text.py +5 -0
natural_pdf/exporters/__init__.py +4 -0
natural_pdf/exporters/base.py +61 -0
natural_pdf/exporters/paddleocr.py +345 -0
natural_pdf/ocr/__init__.py +57 -36
natural_pdf/ocr/engine.py +160 -49
natural_pdf/ocr/engine_easyocr.py +178 -157
natural_pdf/ocr/engine_paddle.py +114 -189
natural_pdf/ocr/engine_surya.py +87 -144
natural_pdf/ocr/ocr_factory.py +125 -0
natural_pdf/ocr/ocr_manager.py +65 -89
natural_pdf/ocr/ocr_options.py +8 -13
natural_pdf/ocr/utils.py +113 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
natural_pdf/templates/spa/css/style.css +334 -0
natural_pdf/templates/spa/index.html +31 -0
natural_pdf/templates/spa/js/app.js +472 -0
natural_pdf/templates/spa/words.txt +235976 -0
natural_pdf/utils/debug.py +34 -0
natural_pdf/utils/identifiers.py +33 -0
natural_pdf/utils/packaging.py +485 -0
natural_pdf/utils/text_extraction.py +44 -64
natural_pdf/utils/visualization.py +1 -1
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
natural_pdf/templates/ocr_debug.html +0 -517
tests/test_loading.py +0 -50
tests/test_optional_deps.py +0 -298
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0

natural_pdf/core/page.py CHANGED Viewed

@@ -10,7 +10,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 import pdfplumber
-from PIL import Image
+from PIL import Image, ImageDraw
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
@@ -43,6 +43,9 @@ from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_t
 from natural_pdf.widgets import InteractiveViewerWidget
 from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
+from natural_pdf.qa import DocumentQA, get_qa_engine
+from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 logger = logging.getLogger(__name__)
@@ -1230,6 +1233,7 @@ class Page:
         render_ocr: bool = False,
         resolution: Optional[float] = None,
         include_highlights: bool = True,
+        exclusions: Optional[str] = None,  # New parameter
         **kwargs,
     ) -> Optional[Image.Image]:
         """
@@ -1244,27 +1248,29 @@ class Page:
             render_ocr: Whether to render OCR text on highlights.
             resolution: Resolution in DPI for base page image (default: scale * 72).
             include_highlights: Whether to render highlights.
+            exclusions: If 'mask', excluded regions will be whited out on the image.
+                        (default: None).
             **kwargs: Additional parameters for pdfplumber.to_image.
         Returns:
             PIL Image of the page, or None if rendering fails.
         """
         image = None
+        render_resolution = resolution if resolution is not None else scale * 72
         try:
             if include_highlights:
                 # Delegate rendering to the central service
                 image = self._highlighter.render_page(
                     page_index=self.index,
-                    scale=scale,
+                    scale=scale,  # Note: scale is used by highlighter internally for drawing
                     labels=labels,
                     legend_position=legend_position,
                     render_ocr=render_ocr,
-                    resolution=resolution,
+                    resolution=render_resolution,  # Pass the calculated resolution
                     **kwargs,
                 )
             else:
                 # Get the base page image directly from pdfplumber if no highlights needed
-                render_resolution = resolution if resolution is not None else scale * 72
                 # Use the underlying pdfplumber page object
                 img_object = self._page.to_image(resolution=render_resolution, **kwargs)
                 # Access the PIL image directly (assuming pdfplumber structure)
@@ -1287,6 +1293,53 @@ class Page:
         if image is None:
             return None
+        # --- Apply exclusion masking if requested ---
+        if exclusions == "mask" and self._exclusions:
+            try:
+                # Ensure image is mutable (RGB or RGBA)
+                if image.mode not in ("RGB", "RGBA"):
+                    image = image.convert("RGB")
+                exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
+                if exclusion_regions:
+                    draw = ImageDraw.Draw(image)
+                    # Calculate the scaling factor used for the image
+                    # Base image was rendered at render_resolution (DPI)
+                    # pdfplumber default is 72 DPI
+                    # Scale factor = (pixels / inch) / (points / inch) = DPI / 72
+                    img_scale = render_resolution / 72.0
+                    for region in exclusion_regions:
+                        # Convert PDF points (x0, top, x1, bottom) to image pixels
+                        img_x0 = region.x0 * img_scale
+                        img_top = region.top * img_scale
+                        img_x1 = region.x1 * img_scale
+                        img_bottom = region.bottom * img_scale
+                        # Draw a white rectangle over the excluded area
+                        # Ensure coordinates are within image bounds (though region should be)
+                        img_coords = (
+                            max(0, img_x0),
+                            max(0, img_top),
+                            min(image.width, img_x1),
+                            min(image.height, img_bottom),
+                        )
+                        if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
+                            draw.rectangle(img_coords, fill="white")
+                        else:
+                            logger.warning(
+                                f"Skipping invalid exclusion rect for masking: {img_coords}"
+                            )
+                    del draw  # Release drawing context
+            except Exception as mask_error:
+                logger.error(
+                    f"Error applying exclusion mask to page {self.index}: {mask_error}",
+                    exc_info=True,
+                )
+                # Decide if you want to return None or continue without mask
+                # For now, continue without mask
         # Resize the final image if width is provided
         if width is not None and width > 0 and image.width > 0:
             aspect_ratio = image.height / image.width
@@ -1328,20 +1381,34 @@ class Page:
         languages: Optional[List[str]] = None,
         min_confidence: Optional[float] = None,
         device: Optional[str] = None,
+        resolution: Optional[int] = None,
+        detect_only: bool = False,
+        apply_exclusions: bool = True,
     ) -> "Page":
         """
         Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
+        Args:
+            engine: Name of the OCR engine.
+            options: Engine-specific options object or dict.
+            languages: List of engine-specific language codes.
+            min_confidence: Minimum confidence threshold.
+            device: Device to run OCR on.
+            resolution: DPI resolution for rendering page image before OCR.
+            apply_exclusions: If True (default), render page image for OCR
+                              with excluded areas masked (whited out).
         Returns:
             List of created TextElements derived from OCR results for this page.
         """
         if not hasattr(self._parent, "apply_ocr"):
             logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
-            return []
+            return []  # Return empty list for consistency
         logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
         try:
             # Delegate to parent PDF, targeting only this page's index
+            # Pass all relevant parameters through, including apply_exclusions
             self._parent.apply_ocr(
                 pages=[self.index],
                 engine=engine,
@@ -1349,17 +1416,21 @@ class Page:
                 languages=languages,
                 min_confidence=min_confidence,
                 device=device,
+                resolution=resolution,
+                detect_only=detect_only,
+                apply_exclusions=apply_exclusions,
             )
         except Exception as e:
             logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
             return []
         # Return the OCR elements specifically added to this page
-        # Use element manager to retrieve them
         ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
         logger.debug(
             f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
         )
+        # Note: The method is typed to return Page for chaining, but the log indicates
+        # finding elements. Let's stick to returning self for chaining consistency.
         return self
     def extract_ocr_elements(
@@ -1369,10 +1440,22 @@ class Page:
         languages: Optional[List[str]] = None,
         min_confidence: Optional[float] = None,
         device: Optional[str] = None,
+        resolution: Optional[int] = None,
     ) -> List[TextElement]:
         """
         Extract text elements using OCR *without* adding them to the page's elements.
         Uses the shared OCRManager instance.
+        Args:
+            engine: Name of the OCR engine.
+            options: Engine-specific options object or dict.
+            languages: List of engine-specific language codes.
+            min_confidence: Minimum confidence threshold.
+            device: Device to run OCR on.
+            resolution: DPI resolution for rendering page image before OCR.
+        Returns:
+            List of created TextElement objects derived from OCR results for this page.
         """
         if not self._ocr_manager:
             logger.error(
@@ -1381,10 +1464,14 @@ class Page:
             return []
         logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
+        # Determine rendering resolution
+        final_resolution = resolution if resolution is not None else 150  # Default to 150 DPI
+        logger.debug(f"  Using rendering resolution: {final_resolution} DPI")
         try:
-            ocr_scale = getattr(self._parent, "_config", {}).get("ocr_image_scale", 2.0)
-            # Get base image without highlights
-            image = self.to_image(scale=ocr_scale, include_highlights=False)
+            # Get base image without highlights using the determined resolution
+            image = self.to_image(resolution=final_resolution, include_highlights=False)
             if not image:
                 logger.error(f"  Failed to render page {self.number} to image for OCR extraction.")
                 return []
@@ -1393,13 +1480,16 @@ class Page:
             logger.error(f"  Failed to render page {self.number} to image: {e}", exc_info=True)
             return []
-        manager_args = {"images": image, "options": options, "engine": engine}
-        if languages is not None:
-            manager_args["languages"] = languages
-        if min_confidence is not None:
-            manager_args["min_confidence"] = min_confidence
-        if device is not None:
-            manager_args["device"] = device
+        # Prepare arguments for the OCR Manager call
+        manager_args = {
+            "images": image,
+            "engine": engine,
+            "languages": languages,
+            "min_confidence": min_confidence,
+            "device": device,
+            "options": options,
+        }
+        manager_args = {k: v for k, v in manager_args.items() if v is not None}
         logger.debug(
             f"  Calling OCR Manager (extract only) with args: { {k:v for k,v in manager_args.items() if k != 'images'} }"
@@ -1415,7 +1505,6 @@ class Page:
                 and isinstance(results_list[0], list)
                 else results_list
             )
             if not isinstance(results, list):
                 logger.error(f"  OCR Manager returned unexpected type: {type(results)}")
                 results = []
@@ -1426,28 +1515,32 @@ class Page:
         # Convert results but DO NOT add to ElementManager
         logger.debug(f"  Converting OCR results to TextElements (extract only)...")
-        # Use a temporary method to create elements without adding them globally
         temp_elements = []
         scale_x = self.width / image.width if image.width else 1
         scale_y = self.height / image.height if image.height else 1
         for result in results:
-            x0, top, x1, bottom = [float(c) for c in result["bbox"]]
-            elem_data = {
-                "text": result["text"],
-                "confidence": result["confidence"],
-                "x0": x0 * scale_x,
-                "top": top * scale_y,
-                "x1": x1 * scale_x,
-                "bottom": bottom * scale_y,
-                "width": (x1 - x0) * scale_x,
-                "height": (bottom - top) * scale_y,
-                "object_type": "text",
-                "source": "ocr",
-                "fontname": "OCR-temp",
-                "size": 10.0,
-                "page_number": self.number,
-            }
-            temp_elements.append(TextElement(elem_data, self))
+            try:  # Added try-except around result processing
+                x0, top, x1, bottom = [float(c) for c in result["bbox"]]
+                elem_data = {
+                    "text": result["text"],
+                    "confidence": result["confidence"],
+                    "x0": x0 * scale_x,
+                    "top": top * scale_y,
+                    "x1": x1 * scale_x,
+                    "bottom": bottom * scale_y,
+                    "width": (x1 - x0) * scale_x,
+                    "height": (bottom - top) * scale_y,
+                    "object_type": "text",  # Using text for temporary elements
+                    "source": "ocr",
+                    "fontname": "OCR-extract",  # Different name for clarity
+                    "size": 10.0,
+                    "page_number": self.number,
+                }
+                temp_elements.append(TextElement(elem_data, self))
+            except (KeyError, ValueError, TypeError) as convert_err:
+                logger.warning(
+                    f"  Skipping invalid OCR result during conversion: {result}. Error: {convert_err}"
+                )
         logger.info(f"  Created {len(temp_elements)} TextElements from OCR (extract only).")
         return temp_elements
@@ -1914,7 +2007,7 @@ class Page:
         Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
         Note: OCR must have been applied to the pages beforehand
-              (e.g., using pdf.apply_ocr()).
+              (e.g., pdf.apply_ocr()).
         Args:
             output_path: Path to save the searchable PDF.
@@ -1929,3 +2022,48 @@ class Page:
         create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
         logger.info(f"Searchable PDF saved to: {output_path_str}")
+    # --- Added correct_ocr method ---
+    def correct_ocr(
+        self,
+        correction_callback: Callable[[Any], Optional[str]],
+    ) -> "Page":  # Return self for chaining
+        """
+        Applies corrections to OCR-generated text elements on this page
+        using a user-provided callback function.
+        Finds text elements on this page whose 'source' attribute starts
+        with 'ocr' (page exclusions are deliberately NOT applied to this lookup)
+        and hands them, together with the callback, to
+        `_apply_ocr_correction_to_elements`, which invokes the callback per element.
+        The `correction_callback` should contain the logic to:
+        1. Determine if the element needs correction.
+        2. Perform the correction (e.g., call an LLM).
+        3. Return the new text (`str`) or `None`.
+        If the callback returns a string, the element's `.text` is updated.
+        Metadata updates (source, confidence, etc.) should happen within the callback.
+        Args:
+            correction_callback: Any callable accepting an element and returning
+                                 `Optional[str]` (new text or None). It does not
+                                 need a `__name__` attribute, so `functools.partial`
+                                 objects and callable instances are accepted.
+        Returns:
+            Self for method chaining.
+        """
+        # functools.partial objects and callable class instances have no __name__;
+        # fall back to repr() so that logging can never abort the correction run.
+        callback_name = getattr(correction_callback, "__name__", repr(correction_callback))
+        logger.info(
+            f"Page {self.number}: Starting OCR correction process using callback '{callback_name}'"
+        )
+        # Find OCR elements specifically on this page
+        # Note: We typically want to correct even if the element falls in an excluded area
+        target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
+        # Delegate to the utility function
+        _apply_ocr_correction_to_elements(
+            elements=target_elements,  # Pass the ElementCollection directly
+            correction_callback=correction_callback,
+            caller_info=f"Page({self.number})",  # Pass caller info
+        )
+        return self  # Return self for chaining

natural_pdf/core/pdf.py CHANGED Viewed

@@ -17,6 +17,8 @@ from typing import (  # Added Iterable and TYPE_CHECKING
     Type,
     Union,
 )
+from pathlib import Path
 import pdfplumber
 from PIL import Image
@@ -235,11 +237,16 @@ class PDF:
         self,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
         engine: Optional[str] = None,
-        options: Optional["OCROptions"] = None,
+        # --- Common OCR Parameters (Direct Arguments) ---
         languages: Optional[List[str]] = None,
-        min_confidence: Optional[float] = None,
+        min_confidence: Optional[float] = None,  # Min confidence threshold
         device: Optional[str] = None,
-        # Add other simple mode args if needed
+        resolution: Optional[int] = None,  # DPI for rendering before OCR
+        apply_exclusions: bool = True,  # New parameter
+        detect_only: bool = False,
+        # --- Engine-Specific Options --- Use 'options=' for this
+        options: Optional[Any] = None,  # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
+        # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
     ) -> "PDF":
         """
         Applies OCR to specified pages (or all pages) of the PDF using batch processing.
@@ -250,20 +257,30 @@ class PDF:
         Args:
             pages: An iterable of 0-based page indices (list, range, tuple),
                    a slice object, or None to process all pages.
-            engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
-                    Uses manager's default if None. Ignored if 'options' is provided.
-            options: An specific Options object (e.g., EasyOCROptions) for
-                     advanced configuration. Overrides simple arguments.
-            languages: List of language codes for simple mode.
-            min_confidence: Minimum confidence threshold for simple mode.
-            device: Device string ('cpu', 'cuda', etc.) for simple mode.
+            engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
+                    Uses manager's default ('easyocr') if None.
+            languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
+                       **Must be codes understood by the specific selected engine.**
+                       No mapping is performed. Overrides manager/engine default.
+            min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
+                            Overrides manager/engine default.
+            device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
+                    Overrides manager/engine default.
+            resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
+                        Affects input quality for OCR. Defaults to 150 if not set.
+            apply_exclusions: If True (default), render page image for OCR with
+                              excluded areas masked (whited out). If False, OCR
+                              the raw page image without masking exclusions.
+            detect_only: If True, only detect text bounding boxes, don't perform OCR.
+            options: An engine-specific options object (e.g., EasyOCROptions) or dict
+                     containing parameters specific to the chosen engine.
         Returns:
             Self for method chaining.
         Raises:
-            ValueError: If page indices are invalid or the engine name is invalid.
-            TypeError: If unexpected keyword arguments are provided in simple mode.
+            ValueError: If page indices are invalid.
+            TypeError: If 'options' is not compatible with the engine.
             RuntimeError: If the OCRManager or selected engine is not available.
         """
         if not self._ocr_manager:
@@ -271,7 +288,7 @@ class PDF:
             # Or raise RuntimeError("OCRManager not initialized.")
             return self
-        # --- Determine Target Pages ---
+        # --- Determine Target Pages (unchanged) ---
         target_pages: List[Page] = []
         if pages is None:
             target_pages = self._pages
@@ -295,44 +312,67 @@ class PDF:
         page_numbers = [p.number for p in target_pages]
         logger.info(f"Applying batch OCR to pages: {page_numbers}...")
+        # --- Determine Rendering Resolution ---
+        # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
+        final_resolution = resolution  # Use direct arg if provided
+        if final_resolution is None:
+            final_resolution = getattr(self, "_config", {}).get("resolution", 150)
+        logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
         # --- Render Images for Batch ---
         images_pil: List[Image.Image] = []
         page_image_map: List[Tuple[Page, Image.Image]] = []  # Store page and its image
-        logger.info(f"Rendering {len(target_pages)} pages to images...")
+        logger.info(
+            f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})..."
+        )
         failed_page_num = "unknown"  # Keep track of potentially failing page
         try:
-            ocr_scale = getattr(self, "_config", {}).get("ocr_image_scale", 2.0)
             for i, page in enumerate(target_pages):
                 failed_page_num = page.number  # Update current page number in case of error
                 logger.debug(f"  Rendering page {page.number} (index {page.index})...")
-                # Use page.to_image but ensure highlights are off for OCR base image
-                img = page.to_image(scale=ocr_scale, include_highlights=False)
+                # Use the determined final_resolution and apply exclusions if requested
+                to_image_kwargs = {
+                    "resolution": final_resolution,
+                    "include_highlights": False,
+                    "exclusions": "mask" if apply_exclusions else None,
+                }
+                img = page.to_image(**to_image_kwargs)
+                if img is None:
+                    logger.error(f"  Failed to render page {page.number} to image.")
+                    # Decide how to handle: skip page, raise error? For now, skip.
+                    continue  # Skip this page if rendering failed
                 images_pil.append(img)
                 page_image_map.append((page, img))  # Store pair
         except Exception as e:
             logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
             raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
-        if not images_pil:
+        if not images_pil or not page_image_map:
             logger.error("No images were successfully rendered for batch OCR.")
             return self
         # --- Prepare Arguments for Manager ---
-        manager_args = {"images": images_pil, "options": options, "engine": engine}
-        simple_args = {}
-        if languages is not None:
-            simple_args["languages"] = languages
-        if min_confidence is not None:
-            simple_args["min_confidence"] = min_confidence
-        if device is not None:
-            simple_args["device"] = device
-        manager_args.update(simple_args)  # Add simple args if options not provided
+        # Pass common args directly, engine-specific via options
+        manager_args = {
+            "images": images_pil,
+            "engine": engine,
+            "languages": languages,
+            "min_confidence": min_confidence,  # Use the renamed parameter
+            "device": device,
+            "options": options,
+            "detect_only": detect_only,
+            # Note: resolution is used for rendering, not passed to OCR manager directly
+        }
+        # Filter out None values so manager can use its defaults
+        manager_args = {k: v for k, v in manager_args.items() if v is not None}
         # --- Call OCR Manager for Batch Processing ---
-        logger.info(f"Calling OCR Manager for batch processing {len(images_pil)} images...")
+        logger.info(
+            f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ..."
+        )
         try:
-            # The manager's apply_ocr handles the batch input and returns List[List[Dict]]
+            # Manager's apply_ocr signature needs to accept common args directly
             batch_results = self._ocr_manager.apply_ocr(**manager_args)
             if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
@@ -341,16 +381,15 @@ class PDF:
                     f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
                     f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
                 )
-                # Handle error - maybe return early or try processing valid parts?
-                return self  # Return self without adding elements
+                return self
             logger.info("OCR Manager batch processing complete.")
         except Exception as e:
             logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
-            return self  # Return self without adding elements
+            return self
-        # --- Distribute Results and Add Elements to Pages ---
+        # --- Distribute Results and Add Elements to Pages (unchanged) ---
         logger.info("Adding OCR results to respective pages...")
         total_elements_added = 0
         for i, (page, img) in enumerate(page_image_map):
@@ -362,10 +401,7 @@ class PDF:
                 continue
             logger.debug(f"  Processing {len(results_for_page)} results for page {page.number}...")
-            # Use the page's element manager to create elements from its results
-            # Changed from page._create_text_elements_from_ocr to use element_mgr
             try:
-                # Calculate scale factors based on rendered image vs page dims
                 img_scale_x = page.width / img.width if img.width > 0 else 1
                 img_scale_y = page.height / img.height if img.height > 0 else 1
                 elements = page._element_mgr.create_text_elements_from_ocr(
@@ -373,7 +409,6 @@ class PDF:
                 )
                 if elements:
-                    # Note: element_mgr.create_text_elements_from_ocr already adds them
                     total_elements_added += len(elements)
                     logger.debug(f"  Added {len(elements)} OCR TextElements to page {page.number}.")
                 else:
@@ -382,7 +417,6 @@ class PDF:
                 logger.error(
                     f"  Error adding OCR elements to page {page.number}: {e}", exc_info=True
                 )
-                # Continue to next page
         logger.info(
             f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
@@ -907,6 +941,87 @@ class PDF:
                 f"Search within index failed for PDF '{self.path}'. See logs for details."
             ) from e
+    def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
+        """
+        Exports OCR results from this PDF into a correction task package (zip file).
+        Args:
+            output_zip_path: The path to save the output zip file.
+            **kwargs: Additional arguments passed to create_correction_task_package
+                      (e.g., image_render_scale, overwrite).
+        Raises:
+            Exception: Any error raised by create_correction_task_package is logged
+                       and re-raised unchanged. A failure to import the packaging
+                       utility itself is only logged; the method then returns
+                       without exporting anything.
+        """
+        # Keep the import in its own try block: only a genuinely unavailable
+        # packaging utility should be reported (and tolerated) as an import problem.
+        # An ImportError raised while the export is running is a real export
+        # failure and must reach the caller like any other error.
+        try:
+            from natural_pdf.utils.packaging import create_correction_task_package
+        except ImportError:
+            logger.error(
+                "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
+            )
+            return
+        try:
+            create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
+        except Exception as e:
+            logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
+            raise  # Re-raise the exception from the utility function
+    def correct_ocr(
+        self,
+        correction_callback: Callable[[Any], Optional[str]],
+        pages: Optional[Union[Iterable[int], range, slice]] = None,
+    ) -> "PDF":  # Return self for chaining
+        """
+        Applies corrections to OCR-generated text elements using a callback function,
+        delegating the core work to the `Page.correct_ocr` method.
+        Args:
+            correction_callback: A function that accepts a single argument (an element
+                                object) and returns `Optional[str]`. It returns the
+                                corrected text string if an update is needed, otherwise None.
+            pages: Optional page indices/slice to limit the scope of correction
+                (default: all pages). Indices are positions in this PDF's page
+                list and must satisfy 0 <= index < number of pages.
+        Returns:
+            Self for method chaining.
+        Raises:
+            ValueError: If an index is out of range or cannot be converted to int.
+            TypeError: If 'pages' is a str/bytes, or is not None, a slice, or an
+                       iterable of page indices.
+        """
+        page_count = len(self._pages)
+        # Determine target pages
+        target_page_indices: List[int] = []
+        if pages is None:
+            target_page_indices = list(range(page_count))
+        elif isinstance(pages, slice):
+            target_page_indices = list(range(*pages.indices(page_count)))
+        elif isinstance(pages, (str, bytes)):
+            # Strings and bytes are iterable, so "12" would otherwise be read
+            # silently as pages 1 and 2. Reject them like any other non-index input.
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+        elif hasattr(pages, "__iter__"):
+            try:
+                target_page_indices = [int(i) for i in pages]
+                # Validate indices
+                for idx in target_page_indices:
+                    if not (0 <= idx < page_count):
+                        raise IndexError(f"Page index {idx} out of range (0-{page_count-1}).")
+            except (IndexError, TypeError, ValueError) as e:
+                # Surface every bad-index condition to the caller as ValueError
+                raise ValueError(
+                    f"Invalid page index or type provided in 'pages': {pages}. Error: {e}"
+                ) from e
+        else:
+            raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
+        if not target_page_indices:
+            logger.warning("No pages selected for OCR correction.")
+            return self
+        logger.info(
+            f"Starting OCR correction process via Page delegation for pages: {target_page_indices}"
+        )
+        # Iterate through target pages and call their correct_ocr method
+        for page_idx in target_page_indices:
+            page = self._pages[page_idx]
+            try:
+                page.correct_ocr(correction_callback=correction_callback)
+            except Exception as e:
+                # Best effort: a failure on one page is logged and the remaining
+                # pages are still processed.
+                logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)
+        logger.info("OCR correction process finished for requested pages.")
+        return self
     def __len__(self) -> int:
         """Return the number of pages in the PDF."""
         # Ensure _pages is initialized
@@ -967,7 +1082,6 @@ class PDF:
         """Context manager exit."""
         self.close()
-# --- Added TYPE_CHECKING import (if not already present) ---
-if TYPE_CHECKING:
-    from pathlib import Path  # Assuming Path is used for type hint
+    # --- Indexable Protocol Methods --- Needed for search/sync
+    def get_id(self) -> str:
+        """Return the identifier of this PDF, which is its `path` attribute."""
+        return self.path

natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

natural-pdf 0.1.5py3-none-any.whl → 0.1.7py3-none-any.whl