PyPI - natural-pdf - Versions diffs - 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl - Mend

natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

natural_pdf/__init__.py +29 -40
natural_pdf/analyzers/text_options.py +9 -1
natural_pdf/analyzers/text_structure.py +371 -58
natural_pdf/classification/manager.py +1 -1
natural_pdf/core/element_manager.py +11 -1
natural_pdf/core/highlighting_service.py +120 -40
natural_pdf/core/page.py +20 -18
natural_pdf/core/pdf.py +146 -13
natural_pdf/elements/base.py +17 -0
natural_pdf/elements/collections.py +374 -30
natural_pdf/elements/region.py +45 -14
natural_pdf/exporters/data/__init__.py +0 -0
natural_pdf/exporters/data/pdf.ttf +0 -0
natural_pdf/exporters/data/sRGB.icc +0 -0
natural_pdf/exporters/hocr.py +519 -0
natural_pdf/exporters/hocr_font.py +136 -0
natural_pdf/exporters/original_pdf.py +127 -0
natural_pdf/exporters/searchable_pdf.py +2 -12
natural_pdf/ocr/engine_surya.py +1 -1
natural_pdf/search/__init__.py +65 -52
natural_pdf/search/lancedb_search_service.py +325 -0
natural_pdf/search/numpy_search_service.py +255 -0
natural_pdf/search/searchable_mixin.py +25 -71
natural_pdf/widgets/viewer.py +22 -31
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
natural_pdf/search/haystack_search_service.py +0 -687
natural_pdf/search/haystack_utils.py +0 -474
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -407,7 +407,17 @@ class ElementManager:
                     char_dict_data = ocr_char_dict  # Use the one we already created
                     char_dict_data["object_type"] = "char"  # Mark as char type
                     char_dict_data.setdefault("adv", char_dict_data.get("width", 0))
-                    self._elements["chars"].append(char_dict_data)  # Append the dictionary
+                    # Create a TextElement for the char representation
+                    # Ensure _char_dicts is handled correctly by TextElement constructor
+                    # For an OCR word represented as a char, its _char_dicts can be a list containing its own data
+                    char_element_specific_data = char_dict_data.copy()
+                    char_element_specific_data["_char_dicts"] = [char_dict_data.copy()]
+                    ocr_char_as_element = TextElement(char_element_specific_data, self._page)
+                    self._elements["chars"].append(
+                        ocr_char_as_element
+                    )  # Append TextElement instance
             except (KeyError, ValueError, TypeError) as e:
                 logger.error(f"Failed to process OCR result: {result}. Error: {e}", exc_info=True)

natural_pdf/core/highlighting_service.py CHANGED Viewed

@@ -611,13 +611,13 @@ class HighlightingService:
         Args:
             page_index: The 0-based index of the page to render.
-            scale: Scale factor for rendering highlights.
+            scale: Scale factor for rendering highlights if width/height/resolution not in kwargs.
             labels: Whether to include a legend for highlights.
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text on the image.
-            resolution: Optional resolution (DPI) for the base page image.
-                       Defaults to scale * 72.
-            kwargs: Additional keyword arguments for pdfplumber's page.to_image.
+            resolution: Optional resolution (DPI) for the base page image if width/height not in kwargs.
+                       Defaults to scale * 72 if not otherwise specified.
+            kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
         Returns:
             A PIL Image object of the rendered page, or None if rendering fails.
@@ -626,34 +626,81 @@ class HighlightingService:
             logger.error(f"Invalid page index {page_index} for rendering.")
             return None
-        page = self._pdf[page_index]
+        page_obj = self._pdf[page_index] # Renamed to avoid conflict
         highlights_on_page = self.get_highlights_for_page(page_index)
-        render_resolution = resolution if resolution is not None else scale * 72
-        base_image = render_plain_page(page, render_resolution)
-        base_image = base_image.convert("RGBA")
-        logger.debug(
-            f"Base image for page {page_index} rendered with resolution {render_resolution}."
-        )
+        to_image_args = kwargs.copy()
+        actual_scale_x = None
+        actual_scale_y = None
+        if "width" in to_image_args and to_image_args["width"] is not None:
+            logger.debug(f"Rendering page {page_index} with width={to_image_args['width']}.")
+            if "height" in to_image_args: to_image_args.pop("height", None)
+            # Actual scale will be calculated after image creation
+        elif "height" in to_image_args and to_image_args["height"] is not None:
+            logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
+            # Actual scale will be calculated after image creation
+        else:
+            # Use explicit resolution from kwargs if present, then the resolution param, then scale
+            render_resolution = to_image_args.pop("resolution", resolution) # Use and remove from kwargs if present
+            if render_resolution is None:
+                render_resolution = scale * 72
+            to_image_args["resolution"] = render_resolution # Add it back for the call
+            actual_scale_x = render_resolution / 72.0
+            actual_scale_y = render_resolution / 72.0
+            logger.debug(f"Rendering page {page_index} with resolution {render_resolution} (scale: {actual_scale_x:.2f}).")
+        try:
+            # base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
+            img_object = page_obj._page.to_image(**to_image_args)
+            base_image_pil = (
+                img_object.annotated
+                if hasattr(img_object, "annotated")
+                else img_object._repr_png_()
+            )
+            if isinstance(base_image_pil, bytes):
+                from io import BytesIO
+                base_image_pil = Image.open(BytesIO(base_image_pil))
+            base_image_pil = base_image_pil.convert("RGBA") # Ensure RGBA for renderer
+            logger.debug(
+                f"Base image for page {page_index} rendered. Size: {base_image_pil.size}."
+            )
+            if actual_scale_x is None or actual_scale_y is None: # If not set by resolution path
+                if page_obj.width > 0:
+                    actual_scale_x = base_image_pil.width / page_obj.width
+                else:
+                    actual_scale_x = scale # Fallback
+                if page_obj.height > 0:
+                    actual_scale_y = base_image_pil.height / page_obj.height
+                else:
+                    actual_scale_y = scale # Fallback
+                logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}")
+        except Exception as e:
+            logger.error(f"Error creating base image for page {page_index}: {e}", exc_info=True)
+            return None
+        renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
         # --- Render Highlights ---
         rendered_image: Image.Image
         if highlights_on_page:
             renderer = HighlightRenderer(
-                page=page,
-                base_image=base_image,
+                page=page_obj,
+                base_image=base_image_pil,
                 highlights=highlights_on_page,
-                scale=scale,
+                scale=renderer_scale, # Use the determined actual scale
                 render_ocr=render_ocr,
             )
             rendered_image = renderer.render()
         else:
             if render_ocr:
-                # Still render OCR even if no highlights
-                renderer = HighlightRenderer(page, base_image, [], scale, True)
+                # Still render OCR even if no highlights, using the determined actual scale
+                renderer = HighlightRenderer(page_obj, base_image_pil, [], renderer_scale, True)
                 rendered_image = renderer.render()
             else:
-                rendered_image = base_image  # No highlights, no OCR requested
+                rendered_image = base_image_pil  # No highlights, no OCR requested
         # --- Add Legend (Based ONLY on this page's highlights) ---
         if labels:
@@ -697,12 +744,12 @@ class HighlightingService:
         Args:
             page_index: Index of the page to render.
             temporary_highlights: List of highlight data dicts (from ElementCollection._prepare).
-            scale: Scale factor for rendering.
+            scale: Original scale factor for rendering, used if width/height are not provided.
             labels: Whether to include a legend.
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text.
-            resolution: Resolution for base page image rendering.
-            **kwargs: Additional args for pdfplumber's to_image.
+            resolution: Resolution for base page image rendering if width/height not used.
+            **kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
         Returns:
             PIL Image of the preview, or None if rendering fails.
@@ -711,35 +758,64 @@ class HighlightingService:
             logger.error(f"Invalid page index {page_index} for render_preview.")
             return None
-        page = self._pdf.pages[page_index]
-        render_resolution = resolution if resolution is not None else scale * 72
+        page_obj = self._pdf.pages[page_index]
+        to_image_args = kwargs.copy()
+        actual_scale_x = None
+        actual_scale_y = None
+        # Determine arguments for page._page.to_image()
+        if "width" in to_image_args and to_image_args["width"] is not None:
+            logger.debug(f"Rendering preview for page {page_index} with width={to_image_args['width']}.")
+            # Resolution is implicitly handled by pdfplumber when width is set
+            if "height" in to_image_args:
+                to_image_args.pop("height", None)
+            # after image is created, we will calculate actual_scale_x and actual_scale_y
+        elif "height" in to_image_args and to_image_args["height"] is not None:
+            logger.debug(f"Rendering preview for page {page_index} with height={to_image_args['height']}.")
+            # Resolution is implicitly handled by pdfplumber when height is set
+            # after image is created, we will calculate actual_scale_x and actual_scale_y
+        else:
+            # Neither width nor height is provided, use resolution or scale.
+            render_resolution = resolution if resolution is not None else scale * 72
+            to_image_args["resolution"] = render_resolution
+            actual_scale_x = render_resolution / 72.0
+            actual_scale_y = render_resolution / 72.0
+            logger.debug(f"Rendering preview for page {page_index} with resolution={render_resolution} (scale: {actual_scale_x:.2f}).")
         try:
-            # Get base image from pdfplumber using the Page object's underlying _page
-            img_object = page._page.to_image(resolution=render_resolution, **kwargs)
-            base_image = (
+            img_object = page_obj._page.to_image(**to_image_args)
+            base_image_pil = (
                 img_object.annotated
                 if hasattr(img_object, "annotated")
                 else img_object._repr_png_()
             )
-            if isinstance(base_image, bytes):
+            if isinstance(base_image_pil, bytes):
                 from io import BytesIO
+                base_image_pil = Image.open(BytesIO(base_image_pil))
+            base_image_pil = base_image_pil.convert("RGB")
-                base_image = Image.open(BytesIO(base_image))
-            base_image = base_image.convert("RGB")  # Ensure consistent format
+            # If scale was not determined by resolution, calculate it now from base_image_pil dimensions
+            if actual_scale_x is None or actual_scale_y is None:
+                if page_obj.width > 0:
+                    actual_scale_x = base_image_pil.width / page_obj.width
+                else:
+                    actual_scale_x = scale # Fallback to original scale
+                if page_obj.height > 0:
+                    actual_scale_y = base_image_pil.height / page_obj.height
+                else:
+                    actual_scale_y = scale # Fallback to original scale
+                logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})")
             # Convert temporary highlight dicts to Highlight objects
-            # Note: Colors/labels should be determined *here* for temporary preview
             preview_highlights = []
             for hl_data in temporary_highlights:
-                # Determine the final color using the service logic
                 final_color = self._determine_highlight_color(
                     color_input=hl_data.get("color"),
                     label=hl_data.get("label"),
                     use_color_cycling=hl_data.get("use_color_cycling", False),
                 )
-                # Extract potential attributes to draw
                 attrs_to_draw = {}
                 element = hl_data.get("element")
                 include_attrs = hl_data.get("include_attrs")
@@ -753,25 +829,29 @@ class HighlightingService:
                             logger.warning(
                                 f"Attribute '{attr_name}' not found on element {element}"
                             )
-                # Add highlight if geometry exists
                 if hl_data.get("bbox") or hl_data.get("polygon"):
                     preview_highlights.append(
                         Highlight(
                             page_index=hl_data["page_index"],
                             bbox=hl_data.get("bbox"),
                             polygon=hl_data.get("polygon"),
-                            color=final_color,  # Use the determined color
+                            color=final_color,
                             label=hl_data.get("label"),
                             attributes=attrs_to_draw,
                         )
                     )
-            # Render only these highlights
-            renderer = HighlightRenderer(page, base_image, preview_highlights, scale, render_ocr)
+            # Use the calculated actual_scale_x for the HighlightRenderer
+            # Assuming HighlightRenderer can handle a single scale or we adapt it.
+            # For now, pdfplumber usually maintains aspect ratio, so one scale should be okay.
+            # If not, HighlightRenderer needs to accept scale_x and scale_y.
+            # We will use actual_scale_x assuming aspect ratio is maintained by pdfplumber,
+            # or if not, it's a reasonable approximation for highlight scaling.
+            renderer_scale = actual_scale_x
+            renderer = HighlightRenderer(page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr)
             rendered_image = renderer.render()
-            # Create legend only from temporary highlights
             legend = None
             if labels:
                 preview_labels = {h.label: h.color for h in preview_highlights if h.label}
@@ -781,7 +861,7 @@ class HighlightingService:
                         rendered_image, legend, position=legend_position
                     )
                 else:
-                    final_image = rendered_image  # No legend needed
+                    final_image = rendered_image
             else:
                 final_image = rendered_image

natural_pdf/core/page.py CHANGED Viewed

@@ -40,10 +40,10 @@ if TYPE_CHECKING:
     from natural_pdf.elements.base import Element
     from natural_pdf.elements.collections import ElementCollection
-# New Imports
+# # New Imports
 import itertools
-# Deskew Imports (Conditional)
+# # Deskew Imports (Conditional)
 import numpy as np
 from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -55,7 +55,7 @@ from natural_pdf.analyzers.text_options import TextStyleOptions
 from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
 from natural_pdf.classification.manager import ClassificationManager  # For type hint
-# --- Classification Imports --- #
+# # --- Classification Imports --- #
 from natural_pdf.classification.mixin import ClassificationMixin  # Import classification mixin
 from natural_pdf.core.element_manager import ElementManager
 from natural_pdf.elements.base import Element  # Import base element
@@ -66,7 +66,7 @@ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.qa import DocumentQA, get_qa_engine
 from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
-# Import new utils
+# # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
 from natural_pdf.widgets import InteractiveViewerWidget
 from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
@@ -210,7 +210,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def add_exclusion(
         self,
-        exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
+        exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
         label: Optional[str] = None,
     ) -> "Page":
         """
@@ -274,7 +274,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return self
-    def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
+    def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
         """
         Add a region to the page.
@@ -305,7 +305,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return self
-    def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
+    def add_regions(self, regions: List["Region"], prefix: Optional[str] = None) -> "Page":
         """
         Add multiple regions to the page.
@@ -327,7 +327,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return self
-    def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
+    def _get_exclusion_regions(self, include_callable=True, debug=False) -> List["Region"]:
         """
         Get all exclusion regions for this page.
         Assumes self._exclusions contains tuples of (callable/Region, label).
@@ -1349,7 +1349,9 @@ class Page(ClassificationMixin, ExtractionMixin):
         self._highlighter.clear_page(self.index)
         return self
-    def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
+    def analyze_text_styles(
+        self, options: Optional[TextStyleOptions] = None
+    ) -> "ElementCollection":
         """
         Analyze text elements by style, adding attributes directly to elements.
@@ -1520,7 +1522,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def _create_text_elements_from_ocr(
         self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
-    ) -> List[TextElement]:
+    ) -> List["TextElement"]:
         """DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr"""
         logger.warning(
             "_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
@@ -1532,7 +1534,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def apply_ocr(
         self,
         engine: Optional[str] = None,
-        options: Optional[OCROptions] = None,
+        options: Optional["OCROptions"] = None,
         languages: Optional[List[str]] = None,
         min_confidence: Optional[float] = None,
         device: Optional[str] = None,
@@ -1597,12 +1599,12 @@ class Page(ClassificationMixin, ExtractionMixin):
     def extract_ocr_elements(
         self,
         engine: Optional[str] = None,
-        options: Optional[OCROptions] = None,
+        options: Optional["OCROptions"] = None,
         languages: Optional[List[str]] = None,
         min_confidence: Optional[float] = None,
         device: Optional[str] = None,
         resolution: Optional[int] = None,
-    ) -> List[TextElement]:
+    ) -> List["TextElement"]:
         """
         Extract text elements using OCR *without* adding them to the page's elements.
         Uses the shared OCRManager instance.
@@ -1716,7 +1718,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         return (self._page.width, self._page.height)
     @property
-    def layout_analyzer(self) -> LayoutAnalyzer:
+    def layout_analyzer(self) -> "LayoutAnalyzer":
         """Get or create the layout analyzer for this page."""
         if self._layout_analyzer is None:
             if not self._layout_manager:
@@ -1728,7 +1730,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def analyze_layout(
         self,
         engine: Optional[str] = None,
-        options: Optional[LayoutOptions] = None,
+        options: Optional["LayoutOptions"] = None,
         confidence: Optional[float] = None,
         classes: Optional[List[str]] = None,
         exclude_classes: Optional[List[str]] = None,
@@ -1736,7 +1738,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         existing: str = "replace",
         model_name: Optional[str] = None,
         client: Optional[Any] = None,  # Add client parameter
-    ) -> ElementCollection[Region]:
+    ) -> "ElementCollection[Region]":
         """
         Analyze the page layout using the configured LayoutManager.
         Adds detected Region objects to the page's element manager.
@@ -1813,7 +1815,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def get_section_between(
         self, start_element=None, end_element=None, boundary_inclusion="both"
-    ) -> Optional[Region]:  # Return Optional
+    ) -> Optional["Region"]:  # Return Optional
         """
         Get a section between two elements on this page.
         """
@@ -2130,7 +2132,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
             logger.error(
                 "Interactive viewer requires optional dependencies ('ipywidgets'). "
-                "Install with `pip install natural-pdf[interactive]`"
+                "Install with `pip install natural-pdf[viewer]`"
             )
             # raise ImportError("ipywidgets not found.") # Option 1: Raise error
             return None  # Option 2: Return None gracefully

natural_pdf/core/pdf.py CHANGED Viewed

@@ -61,6 +61,15 @@ except ImportError:
         )
+try:
+    from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
+except ImportError:
+    create_searchable_pdf = None
+try:
+    from natural_pdf.exporters.original_pdf import create_original_pdf
+except ImportError:
+    create_original_pdf = None
 logger = logging.getLogger("natural_pdf.core.pdf")
 tqdm = get_tqdm()
@@ -260,7 +269,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         return self
     def add_exclusion(
-        self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
+        self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
     ) -> "PDF":
         """
         Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
@@ -468,7 +477,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         return self
     def add_region(
-        self, region_func: Callable[["Page"], Optional[Region]], name: str = None
+        self, region_func: Callable[["Page"], Optional["Region"]], name: str = None
     ) -> "PDF":
         """
         Add a region function to the PDF.
@@ -769,23 +778,137 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
         """
+        DEPRECATED: Use save_pdf(..., ocr=True) instead.
         Saves the PDF with an OCR text layer, making content searchable.
-        Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
+        Requires optional dependencies. Install with: pip install \"natural-pdf[ocr-export]\"
         Args:
             output_path: Path to save the searchable PDF
             dpi: Resolution for rendering and OCR overlay
             **kwargs: Additional keyword arguments passed to the exporter
-            output_path: Path to save the searchable PDF
-            dpi: Resolution for rendering and OCR overlay
-            **kwargs: Additional keyword arguments passed to the exporter
         """
-        from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
+        logger.warning(
+            "PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
+        )
+        if create_searchable_pdf is None:
+            raise ImportError(
+                "Saving searchable PDF requires 'pikepdf'. "
+                'Install with: pip install "natural-pdf[ocr-export]"'
+            )
         output_path_str = str(output_path)
+        # Call the exporter directly, passing self (the PDF instance)
         create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
-        logger.info(f"Searchable PDF saved to: {output_path_str}")
+        # Logger info is handled within the exporter now
+        # logger.info(f"Searchable PDF saved to: {output_path_str}")
+    def save_pdf(
+        self,
+        output_path: Union[str, Path],
+        ocr: bool = False,
+        original: bool = False,
+        dpi: int = 300,
+    ):
+        """
+        Saves the PDF object (all its pages) to a new file.
+        Choose one saving mode:
+        - `ocr=True`: Creates a new, image-based PDF using OCR results from all pages.
+          Text generated during the natural-pdf session becomes searchable,
+          but original vector content is lost. Requires 'ocr-export' extras.
+        - `original=True`: Saves a copy of the original PDF file this object represents.
+          Any OCR results or analyses from the natural-pdf session are NOT included.
+          If the PDF was opened from an in-memory buffer, this mode may not be suitable.
+          Requires 'ocr-export' extras.
+        Args:
+            output_path: Path to save the new PDF file.
+            ocr: If True, save as a searchable, image-based PDF using OCR data.
+            original: If True, save the original source PDF content.
+            dpi: Resolution (dots per inch) used only when ocr=True.
+        Raises:
+            ValueError: If the PDF has no pages, if neither or both 'ocr'
+                        and 'original' are True.
+            ImportError: If required libraries are not installed for the chosen mode.
+            RuntimeError: If an unexpected error occurs during saving.
+        """
+        if not self.pages:
+            raise ValueError("Cannot save an empty PDF object.")
+        if not (ocr ^ original):  # XOR: exactly one must be true
+            raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
+        output_path_obj = Path(output_path)
+        output_path_str = str(output_path_obj)
+        if ocr:
+            has_vector_elements = False
+            for page in self.pages:
+                if (
+                    hasattr(page, "rects")
+                    and page.rects
+                    or hasattr(page, "lines")
+                    and page.lines
+                    or hasattr(page, "curves")
+                    and page.curves
+                    or (
+                        hasattr(page, "chars")
+                        and any(getattr(el, "source", None) != "ocr" for el in page.chars)
+                    )
+                    or (
+                        hasattr(page, "words")
+                        and any(getattr(el, "source", None) != "ocr" for el in page.words)
+                    )
+                ):
+                    has_vector_elements = True
+                    break
+            if has_vector_elements:
+                logger.warning(
+                    "Warning: Saving with ocr=True creates an image-based PDF. "
+                    "Original vector elements (rects, lines, non-OCR text/chars) "
+                    "will not be preserved in the output file."
+                )
+            logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
+            try:
+                # Delegate to the searchable PDF exporter, passing self (PDF instance)
+                create_searchable_pdf(self, output_path_str, dpi=dpi)
+            except Exception as e:
+                raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
+        elif original:
+            if create_original_pdf is None:
+                raise ImportError(
+                    "Saving with original=True requires 'pikepdf'. "
+                    'Install with: pip install "natural-pdf[ocr-export]"'
+                )
+            # Optional: Add warning about losing OCR data similar to PageCollection
+            has_ocr_elements = False
+            for page in self.pages:
+                if hasattr(page, "find_all"):
+                    ocr_text_elements = page.find_all("text[source=ocr]")
+                    if ocr_text_elements:
+                        has_ocr_elements = True
+                        break
+                elif hasattr(page, "words"):  # Fallback
+                    if any(getattr(el, "source", None) == "ocr" for el in page.words):
+                        has_ocr_elements = True
+                        break
+            if has_ocr_elements:
+                logger.warning(
+                    "Warning: Saving with original=True preserves original page content. "
+                    "OCR text generated in this session will not be included in the saved file."
+                )
+            logger.info(f"Saving original PDF content to: {output_path_str}")
+            try:
+                # Delegate to the original PDF exporter, passing self (PDF instance)
+                create_original_pdf(self, output_path_str)
+            except Exception as e:
+                # Re-raise exception from exporter
+                raise e
     def ask(
         self,
@@ -850,9 +973,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     def search_within_index(
         self,
-        query: Union[str, Path, Image.Image, Region],
-        search_service: SearchServiceProtocol,
-        options: Optional[SearchOptions] = None,
+        query: Union[str, Path, Image.Image, "Region"],
+        search_service: "SearchServiceProtocol",
+        options: Optional["SearchOptions"] = None,
     ) -> List[Dict[str, Any]]:
         """
         Finds relevant documents from this PDF within a search index.
@@ -1109,6 +1232,16 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         """Context manager exit."""
         self.close()
+    def __repr__(self) -> str:
+        """Return a string representation of the PDF object."""
+        if not hasattr(self, "_pages"):
+            page_count_str = "uninitialized"
+        else:
+            page_count_str = str(len(self._pages))
+        source_info = getattr(self, "source_path", "unknown source")
+        return f"<PDF source='{source_info}' pages={page_count_str}>"
     def get_id(self) -> str:
         """Get unique identifier for this PDF."""
         """Get unique identifier for this PDF."""
@@ -1282,7 +1415,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             except ImportError:
                 raise ImportError(
                     "Classification dependencies missing. "
-                    'Install with: pip install "natural-pdf[classification]"'
+                    'Install with: pip install "natural-pdf[core-ml]"'
                 )
             raise ClassificationError("ClassificationManager not available.")

natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl