natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. natural_pdf/__init__.py +29 -40
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +20 -18
  8. natural_pdf/core/pdf.py +146 -13
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +374 -30
  11. natural_pdf/elements/region.py +45 -14
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +519 -0
  16. natural_pdf/exporters/hocr_font.py +136 -0
  17. natural_pdf/exporters/original_pdf.py +127 -0
  18. natural_pdf/exporters/searchable_pdf.py +2 -12
  19. natural_pdf/ocr/engine_surya.py +1 -1
  20. natural_pdf/search/__init__.py +65 -52
  21. natural_pdf/search/lancedb_search_service.py +325 -0
  22. natural_pdf/search/numpy_search_service.py +255 -0
  23. natural_pdf/search/searchable_mixin.py +25 -71
  24. natural_pdf/widgets/viewer.py +22 -31
  25. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
  26. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
  27. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  28. natural_pdf/search/haystack_search_service.py +0 -687
  29. natural_pdf/search/haystack_utils.py +0 -474
  30. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
natural_pdf/core/element_manager.py CHANGED
@@ -407,7 +407,17 @@ class ElementManager:
407
407
  char_dict_data = ocr_char_dict # Use the one we already created
408
408
  char_dict_data["object_type"] = "char" # Mark as char type
409
409
  char_dict_data.setdefault("adv", char_dict_data.get("width", 0))
410
- self._elements["chars"].append(char_dict_data) # Append the dictionary
410
+
411
+ # Create a TextElement for the char representation
412
+ # Ensure _char_dicts is handled correctly by TextElement constructor
413
+ # For an OCR word represented as a char, its _char_dicts can be a list containing its own data
414
+ char_element_specific_data = char_dict_data.copy()
415
+ char_element_specific_data["_char_dicts"] = [char_dict_data.copy()]
416
+
417
+ ocr_char_as_element = TextElement(char_element_specific_data, self._page)
418
+ self._elements["chars"].append(
419
+ ocr_char_as_element
420
+ ) # Append TextElement instance
411
421
 
412
422
  except (KeyError, ValueError, TypeError) as e:
413
423
  logger.error(f"Failed to process OCR result: {result}. Error: {e}", exc_info=True)
natural_pdf/core/highlighting_service.py CHANGED
@@ -611,13 +611,13 @@ class HighlightingService:
611
611
 
612
612
  Args:
613
613
  page_index: The 0-based index of the page to render.
614
- scale: Scale factor for rendering highlights.
614
+ scale: Scale factor for rendering highlights if width/height/resolution not in kwargs.
615
615
  labels: Whether to include a legend for highlights.
616
616
  legend_position: Position of the legend.
617
617
  render_ocr: Whether to render OCR text on the image.
618
- resolution: Optional resolution (DPI) for the base page image.
619
- Defaults to scale * 72.
620
- kwargs: Additional keyword arguments for pdfplumber's page.to_image.
618
+ resolution: Optional resolution (DPI) for the base page image if width/height not in kwargs.
619
+ Defaults to scale * 72 if not otherwise specified.
620
+ kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
621
621
 
622
622
  Returns:
623
623
  A PIL Image object of the rendered page, or None if rendering fails.
@@ -626,34 +626,81 @@ class HighlightingService:
626
626
  logger.error(f"Invalid page index {page_index} for rendering.")
627
627
  return None
628
628
 
629
- page = self._pdf[page_index]
629
+ page_obj = self._pdf[page_index] # Renamed to avoid conflict
630
630
  highlights_on_page = self.get_highlights_for_page(page_index)
631
631
 
632
- render_resolution = resolution if resolution is not None else scale * 72
633
- base_image = render_plain_page(page, render_resolution)
634
- base_image = base_image.convert("RGBA")
635
- logger.debug(
636
- f"Base image for page {page_index} rendered with resolution {render_resolution}."
637
- )
632
+ to_image_args = kwargs.copy()
633
+ actual_scale_x = None
634
+ actual_scale_y = None
635
+
636
+ if "width" in to_image_args and to_image_args["width"] is not None:
637
+ logger.debug(f"Rendering page {page_index} with width={to_image_args['width']}.")
638
+ if "height" in to_image_args: to_image_args.pop("height", None)
639
+ # Actual scale will be calculated after image creation
640
+ elif "height" in to_image_args and to_image_args["height"] is not None:
641
+ logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
642
+ # Actual scale will be calculated after image creation
643
+ else:
644
+ # Use explicit resolution from kwargs if present, then the resolution param, then scale
645
+ render_resolution = to_image_args.pop("resolution", resolution) # Use and remove from kwargs if present
646
+ if render_resolution is None:
647
+ render_resolution = scale * 72
648
+ to_image_args["resolution"] = render_resolution # Add it back for the call
649
+ actual_scale_x = render_resolution / 72.0
650
+ actual_scale_y = render_resolution / 72.0
651
+ logger.debug(f"Rendering page {page_index} with resolution {render_resolution} (scale: {actual_scale_x:.2f}).")
652
+
653
+ try:
654
+ # base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
655
+ img_object = page_obj._page.to_image(**to_image_args)
656
+ base_image_pil = (
657
+ img_object.annotated
658
+ if hasattr(img_object, "annotated")
659
+ else img_object._repr_png_()
660
+ )
661
+ if isinstance(base_image_pil, bytes):
662
+ from io import BytesIO
663
+ base_image_pil = Image.open(BytesIO(base_image_pil))
664
+ base_image_pil = base_image_pil.convert("RGBA") # Ensure RGBA for renderer
665
+ logger.debug(
666
+ f"Base image for page {page_index} rendered. Size: {base_image_pil.size}."
667
+ )
668
+
669
+ if actual_scale_x is None or actual_scale_y is None: # If not set by resolution path
670
+ if page_obj.width > 0:
671
+ actual_scale_x = base_image_pil.width / page_obj.width
672
+ else:
673
+ actual_scale_x = scale # Fallback
674
+ if page_obj.height > 0:
675
+ actual_scale_y = base_image_pil.height / page_obj.height
676
+ else:
677
+ actual_scale_y = scale # Fallback
678
+ logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}")
679
+
680
+ except Exception as e:
681
+ logger.error(f"Error creating base image for page {page_index}: {e}", exc_info=True)
682
+ return None
683
+
684
+ renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
638
685
 
639
686
  # --- Render Highlights ---
640
687
  rendered_image: Image.Image
641
688
  if highlights_on_page:
642
689
  renderer = HighlightRenderer(
643
- page=page,
644
- base_image=base_image,
690
+ page=page_obj,
691
+ base_image=base_image_pil,
645
692
  highlights=highlights_on_page,
646
- scale=scale,
693
+ scale=renderer_scale, # Use the determined actual scale
647
694
  render_ocr=render_ocr,
648
695
  )
649
696
  rendered_image = renderer.render()
650
697
  else:
651
698
  if render_ocr:
652
- # Still render OCR even if no highlights
653
- renderer = HighlightRenderer(page, base_image, [], scale, True)
699
+ # Still render OCR even if no highlights, using the determined actual scale
700
+ renderer = HighlightRenderer(page_obj, base_image_pil, [], renderer_scale, True)
654
701
  rendered_image = renderer.render()
655
702
  else:
656
- rendered_image = base_image # No highlights, no OCR requested
703
+ rendered_image = base_image_pil # No highlights, no OCR requested
657
704
 
658
705
  # --- Add Legend (Based ONLY on this page's highlights) ---
659
706
  if labels:
@@ -697,12 +744,12 @@ class HighlightingService:
697
744
  Args:
698
745
  page_index: Index of the page to render.
699
746
  temporary_highlights: List of highlight data dicts (from ElementCollection._prepare).
700
- scale: Scale factor for rendering.
747
+ scale: Original scale factor for rendering, used if width/height are not provided.
701
748
  labels: Whether to include a legend.
702
749
  legend_position: Position of the legend.
703
750
  render_ocr: Whether to render OCR text.
704
- resolution: Resolution for base page image rendering.
705
- **kwargs: Additional args for pdfplumber's to_image.
751
+ resolution: Resolution for base page image rendering if width/height not used.
752
+ **kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
706
753
 
707
754
  Returns:
708
755
  PIL Image of the preview, or None if rendering fails.
@@ -711,35 +758,64 @@ class HighlightingService:
711
758
  logger.error(f"Invalid page index {page_index} for render_preview.")
712
759
  return None
713
760
 
714
- page = self._pdf.pages[page_index]
715
- render_resolution = resolution if resolution is not None else scale * 72
761
+ page_obj = self._pdf.pages[page_index]
762
+
763
+ to_image_args = kwargs.copy()
764
+ actual_scale_x = None
765
+ actual_scale_y = None
766
+
767
+ # Determine arguments for page._page.to_image()
768
+ if "width" in to_image_args and to_image_args["width"] is not None:
769
+ logger.debug(f"Rendering preview for page {page_index} with width={to_image_args['width']}.")
770
+ # Resolution is implicitly handled by pdfplumber when width is set
771
+ if "height" in to_image_args:
772
+ to_image_args.pop("height", None)
773
+ # after image is created, we will calculate actual_scale_x and actual_scale_y
774
+
775
+ elif "height" in to_image_args and to_image_args["height"] is not None:
776
+ logger.debug(f"Rendering preview for page {page_index} with height={to_image_args['height']}.")
777
+ # Resolution is implicitly handled by pdfplumber when height is set
778
+ # after image is created, we will calculate actual_scale_x and actual_scale_y
779
+ else:
780
+ # Neither width nor height is provided, use resolution or scale.
781
+ render_resolution = resolution if resolution is not None else scale * 72
782
+ to_image_args["resolution"] = render_resolution
783
+ actual_scale_x = render_resolution / 72.0
784
+ actual_scale_y = render_resolution / 72.0
785
+ logger.debug(f"Rendering preview for page {page_index} with resolution={render_resolution} (scale: {actual_scale_x:.2f}).")
716
786
 
717
787
  try:
718
- # Get base image from pdfplumber using the Page object's underlying _page
719
- img_object = page._page.to_image(resolution=render_resolution, **kwargs)
720
- base_image = (
788
+ img_object = page_obj._page.to_image(**to_image_args)
789
+ base_image_pil = (
721
790
  img_object.annotated
722
791
  if hasattr(img_object, "annotated")
723
792
  else img_object._repr_png_()
724
793
  )
725
- if isinstance(base_image, bytes):
794
+ if isinstance(base_image_pil, bytes):
726
795
  from io import BytesIO
796
+ base_image_pil = Image.open(BytesIO(base_image_pil))
797
+ base_image_pil = base_image_pil.convert("RGB")
727
798
 
728
- base_image = Image.open(BytesIO(base_image))
729
- base_image = base_image.convert("RGB") # Ensure consistent format
799
+ # If scale was not determined by resolution, calculate it now from base_image_pil dimensions
800
+ if actual_scale_x is None or actual_scale_y is None:
801
+ if page_obj.width > 0:
802
+ actual_scale_x = base_image_pil.width / page_obj.width
803
+ else:
804
+ actual_scale_x = scale # Fallback to original scale
805
+ if page_obj.height > 0:
806
+ actual_scale_y = base_image_pil.height / page_obj.height
807
+ else:
808
+ actual_scale_y = scale # Fallback to original scale
809
+ logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})")
730
810
 
731
811
  # Convert temporary highlight dicts to Highlight objects
732
- # Note: Colors/labels should be determined *here* for temporary preview
733
812
  preview_highlights = []
734
813
  for hl_data in temporary_highlights:
735
- # Determine the final color using the service logic
736
814
  final_color = self._determine_highlight_color(
737
815
  color_input=hl_data.get("color"),
738
816
  label=hl_data.get("label"),
739
817
  use_color_cycling=hl_data.get("use_color_cycling", False),
740
818
  )
741
-
742
- # Extract potential attributes to draw
743
819
  attrs_to_draw = {}
744
820
  element = hl_data.get("element")
745
821
  include_attrs = hl_data.get("include_attrs")
@@ -753,25 +829,29 @@ class HighlightingService:
753
829
  logger.warning(
754
830
  f"Attribute '{attr_name}' not found on element {element}"
755
831
  )
756
-
757
- # Add highlight if geometry exists
758
832
  if hl_data.get("bbox") or hl_data.get("polygon"):
759
833
  preview_highlights.append(
760
834
  Highlight(
761
835
  page_index=hl_data["page_index"],
762
836
  bbox=hl_data.get("bbox"),
763
837
  polygon=hl_data.get("polygon"),
764
- color=final_color, # Use the determined color
838
+ color=final_color,
765
839
  label=hl_data.get("label"),
766
840
  attributes=attrs_to_draw,
767
841
  )
768
842
  )
769
-
770
- # Render only these highlights
771
- renderer = HighlightRenderer(page, base_image, preview_highlights, scale, render_ocr)
843
+
844
+ # Use the calculated actual_scale_x for the HighlightRenderer
845
+ # Assuming HighlightRenderer can handle a single scale or we adapt it.
846
+ # For now, pdfplumber usually maintains aspect ratio, so one scale should be okay.
847
+ # If not, HighlightRenderer needs to accept scale_x and scale_y.
848
+ # We will use actual_scale_x assuming aspect ratio is maintained by pdfplumber,
849
+ # or if not, it's a reasonable approximation for highlight scaling.
850
+ renderer_scale = actual_scale_x
851
+
852
+ renderer = HighlightRenderer(page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr)
772
853
  rendered_image = renderer.render()
773
854
 
774
- # Create legend only from temporary highlights
775
855
  legend = None
776
856
  if labels:
777
857
  preview_labels = {h.label: h.color for h in preview_highlights if h.label}
@@ -781,7 +861,7 @@ class HighlightingService:
781
861
  rendered_image, legend, position=legend_position
782
862
  )
783
863
  else:
784
- final_image = rendered_image # No legend needed
864
+ final_image = rendered_image
785
865
  else:
786
866
  final_image = rendered_image
787
867
 
natural_pdf/core/page.py CHANGED
@@ -40,10 +40,10 @@ if TYPE_CHECKING:
40
40
  from natural_pdf.elements.base import Element
41
41
  from natural_pdf.elements.collections import ElementCollection
42
42
 
43
- # New Imports
43
+ # # New Imports
44
44
  import itertools
45
45
 
46
- # Deskew Imports (Conditional)
46
+ # # Deskew Imports (Conditional)
47
47
  import numpy as np
48
48
  from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
49
49
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -55,7 +55,7 @@ from natural_pdf.analyzers.text_options import TextStyleOptions
55
55
  from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
56
56
  from natural_pdf.classification.manager import ClassificationManager # For type hint
57
57
 
58
- # --- Classification Imports --- #
58
+ # # --- Classification Imports --- #
59
59
  from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
60
60
  from natural_pdf.core.element_manager import ElementManager
61
61
  from natural_pdf.elements.base import Element # Import base element
@@ -66,7 +66,7 @@ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
66
66
  from natural_pdf.qa import DocumentQA, get_qa_engine
67
67
  from natural_pdf.utils.locks import pdf_render_lock # Import the lock
68
68
 
69
- # Import new utils
69
+ # # Import new utils
70
70
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
71
71
  from natural_pdf.widgets import InteractiveViewerWidget
72
72
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
@@ -210,7 +210,7 @@ class Page(ClassificationMixin, ExtractionMixin):
210
210
 
211
211
  def add_exclusion(
212
212
  self,
213
- exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
213
+ exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
214
214
  label: Optional[str] = None,
215
215
  ) -> "Page":
216
216
  """
@@ -274,7 +274,7 @@ class Page(ClassificationMixin, ExtractionMixin):
274
274
 
275
275
  return self
276
276
 
277
- def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
277
+ def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
278
278
  """
279
279
  Add a region to the page.
280
280
 
@@ -305,7 +305,7 @@ class Page(ClassificationMixin, ExtractionMixin):
305
305
 
306
306
  return self
307
307
 
308
- def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
308
+ def add_regions(self, regions: List["Region"], prefix: Optional[str] = None) -> "Page":
309
309
  """
310
310
  Add multiple regions to the page.
311
311
 
@@ -327,7 +327,7 @@ class Page(ClassificationMixin, ExtractionMixin):
327
327
 
328
328
  return self
329
329
 
330
- def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
330
+ def _get_exclusion_regions(self, include_callable=True, debug=False) -> List["Region"]:
331
331
  """
332
332
  Get all exclusion regions for this page.
333
333
  Assumes self._exclusions contains tuples of (callable/Region, label).
@@ -1349,7 +1349,9 @@ class Page(ClassificationMixin, ExtractionMixin):
1349
1349
  self._highlighter.clear_page(self.index)
1350
1350
  return self
1351
1351
 
1352
- def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
1352
+ def analyze_text_styles(
1353
+ self, options: Optional[TextStyleOptions] = None
1354
+ ) -> "ElementCollection":
1353
1355
  """
1354
1356
  Analyze text elements by style, adding attributes directly to elements.
1355
1357
 
@@ -1520,7 +1522,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1520
1522
 
1521
1523
  def _create_text_elements_from_ocr(
1522
1524
  self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
1523
- ) -> List[TextElement]:
1525
+ ) -> List["TextElement"]:
1524
1526
  """DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr"""
1525
1527
  logger.warning(
1526
1528
  "_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
@@ -1532,7 +1534,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1532
1534
  def apply_ocr(
1533
1535
  self,
1534
1536
  engine: Optional[str] = None,
1535
- options: Optional[OCROptions] = None,
1537
+ options: Optional["OCROptions"] = None,
1536
1538
  languages: Optional[List[str]] = None,
1537
1539
  min_confidence: Optional[float] = None,
1538
1540
  device: Optional[str] = None,
@@ -1597,12 +1599,12 @@ class Page(ClassificationMixin, ExtractionMixin):
1597
1599
  def extract_ocr_elements(
1598
1600
  self,
1599
1601
  engine: Optional[str] = None,
1600
- options: Optional[OCROptions] = None,
1602
+ options: Optional["OCROptions"] = None,
1601
1603
  languages: Optional[List[str]] = None,
1602
1604
  min_confidence: Optional[float] = None,
1603
1605
  device: Optional[str] = None,
1604
1606
  resolution: Optional[int] = None,
1605
- ) -> List[TextElement]:
1607
+ ) -> List["TextElement"]:
1606
1608
  """
1607
1609
  Extract text elements using OCR *without* adding them to the page's elements.
1608
1610
  Uses the shared OCRManager instance.
@@ -1716,7 +1718,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1716
1718
  return (self._page.width, self._page.height)
1717
1719
 
1718
1720
  @property
1719
- def layout_analyzer(self) -> LayoutAnalyzer:
1721
+ def layout_analyzer(self) -> "LayoutAnalyzer":
1720
1722
  """Get or create the layout analyzer for this page."""
1721
1723
  if self._layout_analyzer is None:
1722
1724
  if not self._layout_manager:
@@ -1728,7 +1730,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1728
1730
  def analyze_layout(
1729
1731
  self,
1730
1732
  engine: Optional[str] = None,
1731
- options: Optional[LayoutOptions] = None,
1733
+ options: Optional["LayoutOptions"] = None,
1732
1734
  confidence: Optional[float] = None,
1733
1735
  classes: Optional[List[str]] = None,
1734
1736
  exclude_classes: Optional[List[str]] = None,
@@ -1736,7 +1738,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1736
1738
  existing: str = "replace",
1737
1739
  model_name: Optional[str] = None,
1738
1740
  client: Optional[Any] = None, # Add client parameter
1739
- ) -> ElementCollection[Region]:
1741
+ ) -> "ElementCollection[Region]":
1740
1742
  """
1741
1743
  Analyze the page layout using the configured LayoutManager.
1742
1744
  Adds detected Region objects to the page's element manager.
@@ -1813,7 +1815,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1813
1815
 
1814
1816
  def get_section_between(
1815
1817
  self, start_element=None, end_element=None, boundary_inclusion="both"
1816
- ) -> Optional[Region]: # Return Optional
1818
+ ) -> Optional["Region"]: # Return Optional
1817
1819
  """
1818
1820
  Get a section between two elements on this page.
1819
1821
  """
@@ -2130,7 +2132,7 @@ class Page(ClassificationMixin, ExtractionMixin):
2130
2132
  if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
2131
2133
  logger.error(
2132
2134
  "Interactive viewer requires optional dependencies ('ipywidgets'). "
2133
- "Install with `pip install natural-pdf[interactive]`"
2135
+ "Install with `pip install natural-pdf[viewer]`"
2134
2136
  )
2135
2137
  # raise ImportError("ipywidgets not found.") # Option 1: Raise error
2136
2138
  return None # Option 2: Return None gracefully
natural_pdf/core/pdf.py CHANGED
@@ -61,6 +61,15 @@ except ImportError:
61
61
  )
62
62
 
63
63
 
64
+ try:
65
+ from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
66
+ except ImportError:
67
+ create_searchable_pdf = None
68
+ try:
69
+ from natural_pdf.exporters.original_pdf import create_original_pdf
70
+ except ImportError:
71
+ create_original_pdf = None
72
+
64
73
  logger = logging.getLogger("natural_pdf.core.pdf")
65
74
  tqdm = get_tqdm()
66
75
 
@@ -260,7 +269,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
260
269
  return self
261
270
 
262
271
  def add_exclusion(
263
- self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
272
+ self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
264
273
  ) -> "PDF":
265
274
  """
266
275
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
@@ -468,7 +477,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
468
477
  return self
469
478
 
470
479
  def add_region(
471
- self, region_func: Callable[["Page"], Optional[Region]], name: str = None
480
+ self, region_func: Callable[["Page"], Optional["Region"]], name: str = None
472
481
  ) -> "PDF":
473
482
  """
474
483
  Add a region function to the PDF.
@@ -769,23 +778,137 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
769
778
 
770
779
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
771
780
  """
781
+ DEPRECATED: Use save_pdf(..., ocr=True) instead.
772
782
  Saves the PDF with an OCR text layer, making content searchable.
773
783
 
774
- Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
784
+ Requires optional dependencies. Install with: pip install \"natural-pdf[ocr-export]\"
775
785
 
776
786
  Args:
777
787
  output_path: Path to save the searchable PDF
778
788
  dpi: Resolution for rendering and OCR overlay
779
789
  **kwargs: Additional keyword arguments passed to the exporter
780
- output_path: Path to save the searchable PDF
781
- dpi: Resolution for rendering and OCR overlay
782
- **kwargs: Additional keyword arguments passed to the exporter
783
790
  """
784
- from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
785
-
791
+ logger.warning(
792
+ "PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
793
+ )
794
+ if create_searchable_pdf is None:
795
+ raise ImportError(
796
+ "Saving searchable PDF requires 'pikepdf'. "
797
+ 'Install with: pip install "natural-pdf[ocr-export]"'
798
+ )
786
799
  output_path_str = str(output_path)
800
+ # Call the exporter directly, passing self (the PDF instance)
787
801
  create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
788
- logger.info(f"Searchable PDF saved to: {output_path_str}")
802
+ # Logger info is handled within the exporter now
803
+ # logger.info(f"Searchable PDF saved to: {output_path_str}")
804
+
805
def save_pdf(
    self,
    output_path: Union[str, Path],
    ocr: bool = False,
    original: bool = False,
    dpi: int = 300,
):
    """
    Saves the PDF object (all its pages) to a new file.

    Choose one saving mode:
      - `ocr=True`: Creates a new, image-based PDF using OCR results from all pages.
        Text generated during the natural-pdf session becomes searchable,
        but original vector content is lost. Requires 'ocr-export' extras.
      - `original=True`: Saves a copy of the original PDF file this object represents.
        Any OCR results or analyses from the natural-pdf session are NOT included.
        If the PDF was opened from an in-memory buffer, this mode may not be suitable.
        Requires 'ocr-export' extras.

    Args:
        output_path: Path to save the new PDF file.
        ocr: If True, save as a searchable, image-based PDF using OCR data.
        original: If True, save the original source PDF content.
        dpi: Resolution (dots per inch) used only when ocr=True.

    Raises:
        ValueError: If the PDF has no pages, or if neither or both of 'ocr'
            and 'original' are set.
        ImportError: If the exporter for the chosen mode could not be imported.
        RuntimeError: If the searchable-PDF exporter fails (ocr=True). Errors
            raised by the original-PDF exporter propagate unchanged.
    """
    if not self.pages:
        raise ValueError("Cannot save an empty PDF object.")

    # Compare truthiness instead of XOR-ing the raw values: `1 ^ 2` is truthy,
    # so a bitwise XOR would wrongly accept two non-bool truthy flags.
    if bool(ocr) == bool(original):
        raise ValueError("Exactly one of 'ocr' or 'original' must be True.")

    output_path_str = str(Path(output_path))

    def _any_non_ocr(elements) -> bool:
        """True if any element did not originate from OCR."""
        return any(getattr(el, "source", None) != "ocr" for el in elements)

    def _has_vector_content(page) -> bool:
        """True if the page carries content that an image-based export drops."""
        return bool(
            getattr(page, "rects", None)
            or getattr(page, "lines", None)
            or getattr(page, "curves", None)
            or (hasattr(page, "chars") and _any_non_ocr(page.chars))
            or (hasattr(page, "words") and _any_non_ocr(page.words))
        )

    if ocr:
        # The exporter is an optional import that is set to None on failure.
        # Check it up front so a missing dependency is reported as ImportError
        # rather than a "'NoneType' is not callable" TypeError wrapped in
        # RuntimeError by the handler below.
        if create_searchable_pdf is None:
            raise ImportError(
                "Saving searchable PDF requires 'pikepdf'. "
                'Install with: pip install "natural-pdf[ocr-export]"'
            )

        if any(_has_vector_content(page) for page in self.pages):
            logger.warning(
                "Warning: Saving with ocr=True creates an image-based PDF. "
                "Original vector elements (rects, lines, non-OCR text/chars) "
                "will not be preserved in the output file."
            )

        logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
        try:
            # Delegate to the searchable PDF exporter, passing self (PDF instance)
            create_searchable_pdf(self, output_path_str, dpi=dpi)
        except Exception as e:
            raise RuntimeError(f"Failed to create searchable PDF: {e}") from e

    else:
        if create_original_pdf is None:
            raise ImportError(
                "Saving with original=True requires 'pikepdf'. "
                'Install with: pip install "natural-pdf[ocr-export]"'
            )

        # Warn when OCR text exists, because this mode does not write it out.
        has_ocr_elements = False
        for page in self.pages:
            if hasattr(page, "find_all"):
                if page.find_all("text[source=ocr]"):
                    has_ocr_elements = True
                    break
            elif hasattr(page, "words"):  # Fallback when find_all is unavailable
                if any(getattr(el, "source", None) == "ocr" for el in page.words):
                    has_ocr_elements = True
                    break
        if has_ocr_elements:
            logger.warning(
                "Warning: Saving with original=True preserves original page content. "
                "OCR text generated in this session will not be included in the saved file."
            )

        logger.info(f"Saving original PDF content to: {output_path_str}")
        # Delegate to the original PDF exporter; its exceptions propagate as-is.
        create_original_pdf(self, output_path_str)
789
912
 
790
913
  def ask(
791
914
  self,
@@ -850,9 +973,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
850
973
 
851
974
  def search_within_index(
852
975
  self,
853
- query: Union[str, Path, Image.Image, Region],
854
- search_service: SearchServiceProtocol,
855
- options: Optional[SearchOptions] = None,
976
+ query: Union[str, Path, Image.Image, "Region"],
977
+ search_service: "SearchServiceProtocol",
978
+ options: Optional["SearchOptions"] = None,
856
979
  ) -> List[Dict[str, Any]]:
857
980
  """
858
981
  Finds relevant documents from this PDF within a search index.
@@ -1109,6 +1232,16 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1109
1232
  """Context manager exit."""
1110
1233
  self.close()
1111
1234
 
1235
def __repr__(self) -> str:
    """Return a compact debug string of the form ``<PDF source='...' pages=N>``.

    Both attributes are probed defensively so the representation still works
    on an instance where they were never assigned: a missing ``_pages`` is
    reported as ``uninitialized`` and a missing ``source_path`` as
    ``unknown source``.
    """
    page_count_str = str(len(self._pages)) if hasattr(self, "_pages") else "uninitialized"
    source_info = getattr(self, "source_path", "unknown source")
    return f"<PDF source='{source_info}' pages={page_count_str}>"
1244
+
1112
1245
  def get_id(self) -> str:
1113
1246
  """Get unique identifier for this PDF."""
1114
1247
  """Get unique identifier for this PDF."""
@@ -1282,7 +1415,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1282
1415
  except ImportError:
1283
1416
  raise ImportError(
1284
1417
  "Classification dependencies missing. "
1285
- 'Install with: pip install "natural-pdf[classification]"'
1418
+ 'Install with: pip install "natural-pdf[core-ml]"'
1286
1419
  )
1287
1420
  raise ClassificationError("ClassificationManager not available.")
1288
1421