natural-pdf 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
natural_pdf/__init__.py CHANGED
@@ -37,72 +37,56 @@ def configure_logging(level=logging.INFO, handler=None):
37
37
  logger.propagate = False
38
38
 
39
39
 
40
+ # Version
41
+ __version__ = "0.1.1"
42
+
43
+ # Core imports
44
+ from natural_pdf.collections.pdf_collection import PDFCollection
40
45
  from natural_pdf.core.page import Page
41
46
  from natural_pdf.core.pdf import PDF
42
47
  from natural_pdf.elements.collections import ElementCollection
43
48
  from natural_pdf.elements.region import Region
44
49
 
45
- # Import QA module if available
46
- try:
47
- from natural_pdf.qa import DocumentQA, get_qa_engine
48
-
49
- HAS_QA = True
50
- except ImportError:
51
- HAS_QA = False
52
-
53
- __version__ = "0.1.1"
54
-
55
- __all__ = [
56
- "PDF",
57
- "PDFCollection",
58
- "Page",
59
- "Region",
60
- "ElementCollection",
61
- "TextSearchOptions",
62
- "MultiModalSearchOptions",
63
- "BaseSearchOptions",
64
- "configure_logging",
65
- ]
66
-
67
- if HAS_QA:
68
- __all__.extend(["DocumentQA", "get_qa_engine"])
69
-
70
-
71
- from .collections.pdf_collection import PDFCollection
72
-
73
- # Core classes
74
- from .core.pdf import PDF
75
- from .elements.region import Region
50
+ ElementCollection = None
76
51
 
77
52
  # Search options (if extras installed)
78
53
  try:
79
- from .search.search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
54
+ from natural_pdf.search.search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
80
55
  except ImportError:
81
56
  # Define dummy classes if extras not installed, so imports don't break
82
57
  # but using them will raise the ImportError from check_haystack_availability
83
- class TextSearchOptions:
58
+ class BaseSearchOptions:
84
59
  def __init__(self, *args, **kwargs):
85
60
  pass
86
61
 
87
- class MultiModalSearchOptions:
62
+ class TextSearchOptions:
88
63
  def __init__(self, *args, **kwargs):
89
64
  pass
90
65
 
91
- class BaseSearchOptions:
66
+ class MultiModalSearchOptions:
92
67
  def __init__(self, *args, **kwargs):
93
68
  pass
94
69
 
95
-
96
- # Expose logging setup? (Optional)
97
- # from . import logging_config
98
- # logging_config.setup_logging()
70
+ # Import QA module if available
71
+ try:
72
+ from natural_pdf.qa import DocumentQA, get_qa_engine
73
+ HAS_QA = True
74
+ except ImportError:
75
+ HAS_QA = False
99
76
 
100
77
  # Explicitly define what gets imported with 'from natural_pdf import *'
101
78
  __all__ = [
102
79
  "PDF",
103
80
  "PDFCollection",
81
+ "Page",
104
82
  "Region",
105
- "TextSearchOptions", # Include search options
83
+ "ElementCollection",
84
+ "TextSearchOptions",
106
85
  "MultiModalSearchOptions",
107
86
  "BaseSearchOptions",
87
+ "configure_logging",
108
88
  ]
89
+
90
+ # Add QA components to __all__ if available
91
+ if HAS_QA:
92
+ __all__.extend(["DocumentQA", "get_qa_engine"])
natural_pdf/core/page.py CHANGED
@@ -40,10 +40,10 @@ if TYPE_CHECKING:
40
40
  from natural_pdf.elements.base import Element
41
41
  from natural_pdf.elements.collections import ElementCollection
42
42
 
43
- # New Imports
43
+ # # New Imports
44
44
  import itertools
45
45
 
46
- # Deskew Imports (Conditional)
46
+ # # Deskew Imports (Conditional)
47
47
  import numpy as np
48
48
  from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
49
49
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -55,7 +55,7 @@ from natural_pdf.analyzers.text_options import TextStyleOptions
55
55
  from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
56
56
  from natural_pdf.classification.manager import ClassificationManager # For type hint
57
57
 
58
- # --- Classification Imports --- #
58
+ # # --- Classification Imports --- #
59
59
  from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
60
60
  from natural_pdf.core.element_manager import ElementManager
61
61
  from natural_pdf.elements.base import Element # Import base element
@@ -66,7 +66,7 @@ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
66
66
  from natural_pdf.qa import DocumentQA, get_qa_engine
67
67
  from natural_pdf.utils.locks import pdf_render_lock # Import the lock
68
68
 
69
- # Import new utils
69
+ # # Import new utils
70
70
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
71
71
  from natural_pdf.widgets import InteractiveViewerWidget
72
72
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
@@ -210,7 +210,7 @@ class Page(ClassificationMixin, ExtractionMixin):
210
210
 
211
211
  def add_exclusion(
212
212
  self,
213
- exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
213
+ exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
214
214
  label: Optional[str] = None,
215
215
  ) -> "Page":
216
216
  """
@@ -274,7 +274,7 @@ class Page(ClassificationMixin, ExtractionMixin):
274
274
 
275
275
  return self
276
276
 
277
- def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
277
+ def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
278
278
  """
279
279
  Add a region to the page.
280
280
 
@@ -305,7 +305,7 @@ class Page(ClassificationMixin, ExtractionMixin):
305
305
 
306
306
  return self
307
307
 
308
- def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
308
+ def add_regions(self, regions: List["Region"], prefix: Optional[str] = None) -> "Page":
309
309
  """
310
310
  Add multiple regions to the page.
311
311
 
@@ -327,7 +327,7 @@ class Page(ClassificationMixin, ExtractionMixin):
327
327
 
328
328
  return self
329
329
 
330
- def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
330
+ def _get_exclusion_regions(self, include_callable=True, debug=False) -> List["Region"]:
331
331
  """
332
332
  Get all exclusion regions for this page.
333
333
  Assumes self._exclusions contains tuples of (callable/Region, label).
@@ -1349,7 +1349,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1349
1349
  self._highlighter.clear_page(self.index)
1350
1350
  return self
1351
1351
 
1352
- def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
1352
+ def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> "ElementCollection":
1353
1353
  """
1354
1354
  Analyze text elements by style, adding attributes directly to elements.
1355
1355
 
@@ -1520,7 +1520,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1520
1520
 
1521
1521
  def _create_text_elements_from_ocr(
1522
1522
  self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
1523
- ) -> List[TextElement]:
1523
+ ) -> List["TextElement"]:
1524
1524
  """DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr"""
1525
1525
  logger.warning(
1526
1526
  "_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
@@ -1532,7 +1532,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1532
1532
  def apply_ocr(
1533
1533
  self,
1534
1534
  engine: Optional[str] = None,
1535
- options: Optional[OCROptions] = None,
1535
+ options: Optional["OCROptions"] = None,
1536
1536
  languages: Optional[List[str]] = None,
1537
1537
  min_confidence: Optional[float] = None,
1538
1538
  device: Optional[str] = None,
@@ -1597,12 +1597,12 @@ class Page(ClassificationMixin, ExtractionMixin):
1597
1597
  def extract_ocr_elements(
1598
1598
  self,
1599
1599
  engine: Optional[str] = None,
1600
- options: Optional[OCROptions] = None,
1600
+ options: Optional["OCROptions"] = None,
1601
1601
  languages: Optional[List[str]] = None,
1602
1602
  min_confidence: Optional[float] = None,
1603
1603
  device: Optional[str] = None,
1604
1604
  resolution: Optional[int] = None,
1605
- ) -> List[TextElement]:
1605
+ ) -> List["TextElement"]:
1606
1606
  """
1607
1607
  Extract text elements using OCR *without* adding them to the page's elements.
1608
1608
  Uses the shared OCRManager instance.
@@ -1716,7 +1716,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1716
1716
  return (self._page.width, self._page.height)
1717
1717
 
1718
1718
  @property
1719
- def layout_analyzer(self) -> LayoutAnalyzer:
1719
+ def layout_analyzer(self) -> "LayoutAnalyzer":
1720
1720
  """Get or create the layout analyzer for this page."""
1721
1721
  if self._layout_analyzer is None:
1722
1722
  if not self._layout_manager:
@@ -1728,7 +1728,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1728
1728
  def analyze_layout(
1729
1729
  self,
1730
1730
  engine: Optional[str] = None,
1731
- options: Optional[LayoutOptions] = None,
1731
+ options: Optional["LayoutOptions"] = None,
1732
1732
  confidence: Optional[float] = None,
1733
1733
  classes: Optional[List[str]] = None,
1734
1734
  exclude_classes: Optional[List[str]] = None,
@@ -1736,7 +1736,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1736
1736
  existing: str = "replace",
1737
1737
  model_name: Optional[str] = None,
1738
1738
  client: Optional[Any] = None, # Add client parameter
1739
- ) -> ElementCollection[Region]:
1739
+ ) -> "ElementCollection[Region]":
1740
1740
  """
1741
1741
  Analyze the page layout using the configured LayoutManager.
1742
1742
  Adds detected Region objects to the page's element manager.
@@ -1813,7 +1813,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1813
1813
 
1814
1814
  def get_section_between(
1815
1815
  self, start_element=None, end_element=None, boundary_inclusion="both"
1816
- ) -> Optional[Region]: # Return Optional
1816
+ ) -> Optional["Region"]: # Return Optional
1817
1817
  """
1818
1818
  Get a section between two elements on this page.
1819
1819
  """
natural_pdf/core/pdf.py CHANGED
@@ -60,6 +60,14 @@ except ImportError:
60
60
  "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
61
61
  )
62
62
 
63
+ try:
64
+ from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
65
+ except ImportError:
66
+ create_searchable_pdf = None
67
+ try:
68
+ from natural_pdf.exporters.original_pdf import create_original_pdf
69
+ except ImportError:
70
+ create_original_pdf = None
63
71
 
64
72
  logger = logging.getLogger("natural_pdf.core.pdf")
65
73
  tqdm = get_tqdm()
@@ -260,7 +268,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
260
268
  return self
261
269
 
262
270
  def add_exclusion(
263
- self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
271
+ self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
264
272
  ) -> "PDF":
265
273
  """
266
274
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
@@ -468,7 +476,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
468
476
  return self
469
477
 
470
478
  def add_region(
471
- self, region_func: Callable[["Page"], Optional[Region]], name: str = None
479
+ self, region_func: Callable[["Page"], Optional["Region"]], name: str = None
472
480
  ) -> "PDF":
473
481
  """
474
482
  Add a region function to the PDF.
@@ -769,23 +777,133 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
769
777
 
770
778
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
771
779
  """
780
+ DEPRECATED: Use save_pdf(..., ocr=True) instead.
772
781
  Saves the PDF with an OCR text layer, making content searchable.
773
782
 
774
- Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
783
+ Requires optional dependencies. Install with: pip install \"natural-pdf[ocr-export]\"
775
784
 
776
785
  Args:
777
786
  output_path: Path to save the searchable PDF
778
787
  dpi: Resolution for rendering and OCR overlay
779
788
  **kwargs: Additional keyword arguments passed to the exporter
780
- output_path: Path to save the searchable PDF
781
- dpi: Resolution for rendering and OCR overlay
782
- **kwargs: Additional keyword arguments passed to the exporter
783
789
  """
784
- from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
785
-
790
+ logger.warning(
791
+ "PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
792
+ )
793
+ if create_searchable_pdf is None:
794
+ raise ImportError(
795
+ "Saving searchable PDF requires 'pikepdf' and 'Pillow'. "
796
+ "Install with: pip install \"natural-pdf[ocr-export]\""
797
+ )
786
798
  output_path_str = str(output_path)
799
+ # Call the exporter directly, passing self (the PDF instance)
787
800
  create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
788
- logger.info(f"Searchable PDF saved to: {output_path_str}")
801
+ # Logger info is handled within the exporter now
802
+ # logger.info(f"Searchable PDF saved to: {output_path_str}")
803
+
804
+ def save_pdf(
805
+ self,
806
+ output_path: Union[str, Path],
807
+ ocr: bool = False,
808
+ original: bool = False,
809
+ dpi: int = 300,
810
+ ):
811
+ """
812
+ Saves the PDF object (all its pages) to a new file.
813
+
814
+ Choose one saving mode:
815
+ - `ocr=True`: Creates a new, image-based PDF using OCR results from all pages.
816
+ Text generated during the natural-pdf session becomes searchable,
817
+ but original vector content is lost. Requires 'ocr-export' extras.
818
+ - `original=True`: Saves a copy of the original PDF file this object represents.
819
+ Any OCR results or analyses from the natural-pdf session are NOT included.
820
+ If the PDF was opened from an in-memory buffer, this mode may not be suitable.
821
+ Requires 'ocr-export' extras.
822
+
823
+ Args:
824
+ output_path: Path to save the new PDF file.
825
+ ocr: If True, save as a searchable, image-based PDF using OCR data.
826
+ original: If True, save the original source PDF content.
827
+ dpi: Resolution (dots per inch) used only when ocr=True.
828
+
829
+ Raises:
830
+ ValueError: If the PDF has no pages, if neither or both 'ocr'
831
+ and 'original' are True.
832
+ ImportError: If required libraries are not installed for the chosen mode.
833
+ RuntimeError: If an unexpected error occurs during saving.
834
+ """
835
+ if not self.pages:
836
+ raise ValueError("Cannot save an empty PDF object.")
837
+
838
+ if not (ocr ^ original): # XOR: exactly one must be true
839
+ raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
840
+
841
+ output_path_obj = Path(output_path)
842
+ output_path_str = str(output_path_obj)
843
+
844
+ if ocr:
845
+ if create_searchable_pdf is None:
846
+ raise ImportError(
847
+ "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
848
+ "Install with: pip install \"natural-pdf[ocr-export]\""
849
+ )
850
+
851
+ # Optional: Add warning about vector data loss similar to PageCollection
852
+ has_vector_elements = False
853
+ for page in self.pages:
854
+ if (hasattr(page, 'rects') and page.rects or
855
+ hasattr(page, 'lines') and page.lines or
856
+ hasattr(page, 'curves') and page.curves or
857
+ (hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
858
+ (hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
859
+ has_vector_elements = True
860
+ break
861
+ if has_vector_elements:
862
+ logger.warning(
863
+ "Warning: Saving with ocr=True creates an image-based PDF. "
864
+ "Original vector elements (rects, lines, non-OCR text/chars) "
865
+ "will not be preserved in the output file."
866
+ )
867
+
868
+ logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
869
+ try:
870
+ # Delegate to the searchable PDF exporter, passing self (PDF instance)
871
+ create_searchable_pdf(self, output_path_str, dpi=dpi)
872
+ except Exception as e:
873
+ raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
874
+
875
+ elif original:
876
+ if create_original_pdf is None:
877
+ raise ImportError(
878
+ "Saving with original=True requires 'pikepdf'. "
879
+ "Install with: pip install \"natural-pdf[ocr-export]\""
880
+ )
881
+
882
+ # Optional: Add warning about losing OCR data similar to PageCollection
883
+ has_ocr_elements = False
884
+ for page in self.pages:
885
+ if hasattr(page, 'find_all'):
886
+ ocr_text_elements = page.find_all("text[source=ocr]")
887
+ if ocr_text_elements:
888
+ has_ocr_elements = True
889
+ break
890
+ elif hasattr(page, 'words'): # Fallback
891
+ if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
892
+ has_ocr_elements = True
893
+ break
894
+ if has_ocr_elements:
895
+ logger.warning(
896
+ "Warning: Saving with original=True preserves original page content. "
897
+ "OCR text generated in this session will not be included in the saved file."
898
+ )
899
+
900
+ logger.info(f"Saving original PDF content to: {output_path_str}")
901
+ try:
902
+ # Delegate to the original PDF exporter, passing self (PDF instance)
903
+ create_original_pdf(self, output_path_str)
904
+ except Exception as e:
905
+ # Re-raise exception from exporter
906
+ raise e
789
907
 
790
908
  def ask(
791
909
  self,
@@ -850,9 +968,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
850
968
 
851
969
  def search_within_index(
852
970
  self,
853
- query: Union[str, Path, Image.Image, Region],
854
- search_service: SearchServiceProtocol,
855
- options: Optional[SearchOptions] = None,
971
+ query: Union[str, Path, Image.Image, "Region"],
972
+ search_service: "SearchServiceProtocol",
973
+ options: Optional["SearchOptions"] = None,
856
974
  ) -> List[Dict[str, Any]]:
857
975
  """
858
976
  Finds relevant documents from this PDF within a search index.