natural-pdf 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -40,10 +40,10 @@ if TYPE_CHECKING:
40
40
  from natural_pdf.elements.base import Element
41
41
  from natural_pdf.elements.collections import ElementCollection
42
42
 
43
- # New Imports
43
+ # # New Imports
44
44
  import itertools
45
45
 
46
- # Deskew Imports (Conditional)
46
+ # # Deskew Imports (Conditional)
47
47
  import numpy as np
48
48
  from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
49
49
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -55,7 +55,7 @@ from natural_pdf.analyzers.text_options import TextStyleOptions
55
55
  from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
56
56
  from natural_pdf.classification.manager import ClassificationManager # For type hint
57
57
 
58
- # --- Classification Imports --- #
58
+ # # --- Classification Imports --- #
59
59
  from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
60
60
  from natural_pdf.core.element_manager import ElementManager
61
61
  from natural_pdf.elements.base import Element # Import base element
@@ -66,7 +66,7 @@ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
66
66
  from natural_pdf.qa import DocumentQA, get_qa_engine
67
67
  from natural_pdf.utils.locks import pdf_render_lock # Import the lock
68
68
 
69
- # Import new utils
69
+ # # Import new utils
70
70
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
71
71
  from natural_pdf.widgets import InteractiveViewerWidget
72
72
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
@@ -210,7 +210,7 @@ class Page(ClassificationMixin, ExtractionMixin):
210
210
 
211
211
  def add_exclusion(
212
212
  self,
213
- exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
213
+ exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
214
214
  label: Optional[str] = None,
215
215
  ) -> "Page":
216
216
  """
@@ -274,7 +274,7 @@ class Page(ClassificationMixin, ExtractionMixin):
274
274
 
275
275
  return self
276
276
 
277
- def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
277
+ def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
278
278
  """
279
279
  Add a region to the page.
280
280
 
@@ -305,7 +305,7 @@ class Page(ClassificationMixin, ExtractionMixin):
305
305
 
306
306
  return self
307
307
 
308
- def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
308
+ def add_regions(self, regions: List["Region"], prefix: Optional[str] = None) -> "Page":
309
309
  """
310
310
  Add multiple regions to the page.
311
311
 
@@ -327,7 +327,7 @@ class Page(ClassificationMixin, ExtractionMixin):
327
327
 
328
328
  return self
329
329
 
330
- def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
330
+ def _get_exclusion_regions(self, include_callable=True, debug=False) -> List["Region"]:
331
331
  """
332
332
  Get all exclusion regions for this page.
333
333
  Assumes self._exclusions contains tuples of (callable/Region, label).
@@ -1349,7 +1349,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1349
1349
  self._highlighter.clear_page(self.index)
1350
1350
  return self
1351
1351
 
1352
- def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
1352
+ def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> "ElementCollection":
1353
1353
  """
1354
1354
  Analyze text elements by style, adding attributes directly to elements.
1355
1355
 
@@ -1520,7 +1520,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1520
1520
 
1521
1521
  def _create_text_elements_from_ocr(
1522
1522
  self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
1523
- ) -> List[TextElement]:
1523
+ ) -> List["TextElement"]:
1524
1524
  """DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr"""
1525
1525
  logger.warning(
1526
1526
  "_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
@@ -1532,7 +1532,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1532
1532
  def apply_ocr(
1533
1533
  self,
1534
1534
  engine: Optional[str] = None,
1535
- options: Optional[OCROptions] = None,
1535
+ options: Optional["OCROptions"] = None,
1536
1536
  languages: Optional[List[str]] = None,
1537
1537
  min_confidence: Optional[float] = None,
1538
1538
  device: Optional[str] = None,
@@ -1597,12 +1597,12 @@ class Page(ClassificationMixin, ExtractionMixin):
1597
1597
  def extract_ocr_elements(
1598
1598
  self,
1599
1599
  engine: Optional[str] = None,
1600
- options: Optional[OCROptions] = None,
1600
+ options: Optional["OCROptions"] = None,
1601
1601
  languages: Optional[List[str]] = None,
1602
1602
  min_confidence: Optional[float] = None,
1603
1603
  device: Optional[str] = None,
1604
1604
  resolution: Optional[int] = None,
1605
- ) -> List[TextElement]:
1605
+ ) -> List["TextElement"]:
1606
1606
  """
1607
1607
  Extract text elements using OCR *without* adding them to the page's elements.
1608
1608
  Uses the shared OCRManager instance.
@@ -1716,7 +1716,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1716
1716
  return (self._page.width, self._page.height)
1717
1717
 
1718
1718
  @property
1719
- def layout_analyzer(self) -> LayoutAnalyzer:
1719
+ def layout_analyzer(self) -> "LayoutAnalyzer":
1720
1720
  """Get or create the layout analyzer for this page."""
1721
1721
  if self._layout_analyzer is None:
1722
1722
  if not self._layout_manager:
@@ -1728,7 +1728,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1728
1728
  def analyze_layout(
1729
1729
  self,
1730
1730
  engine: Optional[str] = None,
1731
- options: Optional[LayoutOptions] = None,
1731
+ options: Optional["LayoutOptions"] = None,
1732
1732
  confidence: Optional[float] = None,
1733
1733
  classes: Optional[List[str]] = None,
1734
1734
  exclude_classes: Optional[List[str]] = None,
@@ -1736,7 +1736,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1736
1736
  existing: str = "replace",
1737
1737
  model_name: Optional[str] = None,
1738
1738
  client: Optional[Any] = None, # Add client parameter
1739
- ) -> ElementCollection[Region]:
1739
+ ) -> "ElementCollection[Region]":
1740
1740
  """
1741
1741
  Analyze the page layout using the configured LayoutManager.
1742
1742
  Adds detected Region objects to the page's element manager.
@@ -1813,7 +1813,7 @@ class Page(ClassificationMixin, ExtractionMixin):
1813
1813
 
1814
1814
  def get_section_between(
1815
1815
  self, start_element=None, end_element=None, boundary_inclusion="both"
1816
- ) -> Optional[Region]: # Return Optional
1816
+ ) -> Optional["Region"]: # Return Optional
1817
1817
  """
1818
1818
  Get a section between two elements on this page.
1819
1819
  """
natural_pdf/core/pdf.py CHANGED
@@ -60,6 +60,14 @@ except ImportError:
60
60
  "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
61
61
  )
62
62
 
63
+ try:
64
+ from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
65
+ except ImportError:
66
+ create_searchable_pdf = None
67
+ try:
68
+ from natural_pdf.exporters.original_pdf import create_original_pdf
69
+ except ImportError:
70
+ create_original_pdf = None
63
71
 
64
72
  logger = logging.getLogger("natural_pdf.core.pdf")
65
73
  tqdm = get_tqdm()
@@ -84,7 +92,7 @@ except ImportError:
84
92
  # End Deskew Imports
85
93
 
86
94
 
87
- class PDF(ExtractionMixin, ExportMixin):
95
+ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
88
96
  """
89
97
  Enhanced PDF wrapper built on top of pdfplumber.
90
98
 
@@ -194,6 +202,7 @@ class PDF(ExtractionMixin, ExportMixin):
194
202
 
195
203
  self._initialize_managers()
196
204
  self._initialize_highlighter()
205
+ self.analyses: Dict[str, Any] = {}
197
206
 
198
207
  def _initialize_managers(self):
199
208
  """Initialize manager instances based on DEFAULT_MANAGERS."""
@@ -259,7 +268,7 @@ class PDF(ExtractionMixin, ExportMixin):
259
268
  return self
260
269
 
261
270
  def add_exclusion(
262
- self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
271
+ self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
263
272
  ) -> "PDF":
264
273
  """
265
274
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
@@ -467,7 +476,7 @@ class PDF(ExtractionMixin, ExportMixin):
467
476
  return self
468
477
 
469
478
  def add_region(
470
- self, region_func: Callable[["Page"], Optional[Region]], name: str = None
479
+ self, region_func: Callable[["Page"], Optional["Region"]], name: str = None
471
480
  ) -> "PDF":
472
481
  """
473
482
  Add a region function to the PDF.
@@ -768,23 +777,133 @@ class PDF(ExtractionMixin, ExportMixin):
768
777
 
769
778
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
770
779
  """
780
+ DEPRECATED: Use save_pdf(..., ocr=True) instead.
771
781
  Saves the PDF with an OCR text layer, making content searchable.
772
782
 
773
- Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
783
+ Requires optional dependencies. Install with: pip install \"natural-pdf[ocr-export]\"
774
784
 
775
785
  Args:
776
786
  output_path: Path to save the searchable PDF
777
787
  dpi: Resolution for rendering and OCR overlay
778
788
  **kwargs: Additional keyword arguments passed to the exporter
779
- output_path: Path to save the searchable PDF
780
- dpi: Resolution for rendering and OCR overlay
781
- **kwargs: Additional keyword arguments passed to the exporter
782
789
  """
783
- from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
784
-
790
+ logger.warning(
791
+ "PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
792
+ )
793
+ if create_searchable_pdf is None:
794
+ raise ImportError(
795
+ "Saving searchable PDF requires 'pikepdf' and 'Pillow'. "
796
+ "Install with: pip install \"natural-pdf[ocr-export]\""
797
+ )
785
798
  output_path_str = str(output_path)
799
+ # Call the exporter directly, passing self (the PDF instance)
786
800
  create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
787
- logger.info(f"Searchable PDF saved to: {output_path_str}")
801
+ # Logger info is handled within the exporter now
802
+ # logger.info(f"Searchable PDF saved to: {output_path_str}")
803
+
804
+ def save_pdf(
805
+ self,
806
+ output_path: Union[str, Path],
807
+ ocr: bool = False,
808
+ original: bool = False,
809
+ dpi: int = 300,
810
+ ):
811
+ """
812
+ Saves the PDF object (all its pages) to a new file.
813
+
814
+ Choose one saving mode:
815
+ - `ocr=True`: Creates a new, image-based PDF using OCR results from all pages.
816
+ Text generated during the natural-pdf session becomes searchable,
817
+ but original vector content is lost. Requires 'ocr-export' extras.
818
+ - `original=True`: Saves a copy of the original PDF file this object represents.
819
+ Any OCR results or analyses from the natural-pdf session are NOT included.
820
+ If the PDF was opened from an in-memory buffer, this mode may not be suitable.
821
+ Requires 'ocr-export' extras.
822
+
823
+ Args:
824
+ output_path: Path to save the new PDF file.
825
+ ocr: If True, save as a searchable, image-based PDF using OCR data.
826
+ original: If True, save the original source PDF content.
827
+ dpi: Resolution (dots per inch) used only when ocr=True.
828
+
829
+ Raises:
830
+ ValueError: If the PDF has no pages, if neither or both 'ocr'
831
+ and 'original' are True.
832
+ ImportError: If required libraries are not installed for the chosen mode.
833
+ RuntimeError: If an unexpected error occurs during saving.
834
+ """
835
+ if not self.pages:
836
+ raise ValueError("Cannot save an empty PDF object.")
837
+
838
+ if not (ocr ^ original): # XOR: exactly one must be true
839
+ raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
840
+
841
+ output_path_obj = Path(output_path)
842
+ output_path_str = str(output_path_obj)
843
+
844
+ if ocr:
845
+ if create_searchable_pdf is None:
846
+ raise ImportError(
847
+ "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
848
+ "Install with: pip install \"natural-pdf[ocr-export]\""
849
+ )
850
+
851
+ # Optional: Add warning about vector data loss similar to PageCollection
852
+ has_vector_elements = False
853
+ for page in self.pages:
854
+ if (hasattr(page, 'rects') and page.rects or
855
+ hasattr(page, 'lines') and page.lines or
856
+ hasattr(page, 'curves') and page.curves or
857
+ (hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
858
+ (hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
859
+ has_vector_elements = True
860
+ break
861
+ if has_vector_elements:
862
+ logger.warning(
863
+ "Warning: Saving with ocr=True creates an image-based PDF. "
864
+ "Original vector elements (rects, lines, non-OCR text/chars) "
865
+ "will not be preserved in the output file."
866
+ )
867
+
868
+ logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
869
+ try:
870
+ # Delegate to the searchable PDF exporter, passing self (PDF instance)
871
+ create_searchable_pdf(self, output_path_str, dpi=dpi)
872
+ except Exception as e:
873
+ raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
874
+
875
+ elif original:
876
+ if create_original_pdf is None:
877
+ raise ImportError(
878
+ "Saving with original=True requires 'pikepdf'. "
879
+ "Install with: pip install \"natural-pdf[ocr-export]\""
880
+ )
881
+
882
+ # Optional: Add warning about losing OCR data similar to PageCollection
883
+ has_ocr_elements = False
884
+ for page in self.pages:
885
+ if hasattr(page, 'find_all'):
886
+ ocr_text_elements = page.find_all("text[source=ocr]")
887
+ if ocr_text_elements:
888
+ has_ocr_elements = True
889
+ break
890
+ elif hasattr(page, 'words'): # Fallback
891
+ if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
892
+ has_ocr_elements = True
893
+ break
894
+ if has_ocr_elements:
895
+ logger.warning(
896
+ "Warning: Saving with original=True preserves original page content. "
897
+ "OCR text generated in this session will not be included in the saved file."
898
+ )
899
+
900
+ logger.info(f"Saving original PDF content to: {output_path_str}")
901
+ try:
902
+ # Delegate to the original PDF exporter, passing self (PDF instance)
903
+ create_original_pdf(self, output_path_str)
904
+ except Exception as e:
905
+ # Re-raise exception from exporter
906
+ raise e
788
907
 
789
908
  def ask(
790
909
  self,
@@ -849,9 +968,9 @@ class PDF(ExtractionMixin, ExportMixin):
849
968
 
850
969
  def search_within_index(
851
970
  self,
852
- query: Union[str, Path, Image.Image, Region],
853
- search_service: SearchServiceProtocol,
854
- options: Optional[SearchOptions] = None,
971
+ query: Union[str, Path, Image.Image, "Region"],
972
+ search_service: "SearchServiceProtocol",
973
+ options: Optional["SearchOptions"] = None,
855
974
  ) -> List[Dict[str, Any]]:
856
975
  """
857
976
  Finds relevant documents from this PDF within a search index.
@@ -1243,7 +1362,7 @@ class PDF(ExtractionMixin, ExportMixin):
1243
1362
 
1244
1363
  def classify_pages(
1245
1364
  self,
1246
- categories: List[str],
1365
+ labels: List[str],
1247
1366
  model: Optional[str] = None,
1248
1367
  pages: Optional[Union[Iterable[int], range, slice]] = None,
1249
1368
  analysis_key: str = "classification",
@@ -1254,7 +1373,7 @@ class PDF(ExtractionMixin, ExportMixin):
1254
1373
  Classifies specified pages of the PDF.
1255
1374
 
1256
1375
  Args:
1257
- categories: List of category names
1376
+ labels: List of category names
1258
1377
  model: Model identifier ('text', 'vision', or specific HF ID)
1259
1378
  pages: Page indices, slice, or None for all pages
1260
1379
  analysis_key: Key to store results in page's analyses dict
@@ -1264,8 +1383,8 @@ class PDF(ExtractionMixin, ExportMixin):
1264
1383
  Returns:
1265
1384
  Self for method chaining
1266
1385
  """
1267
- if not categories:
1268
- raise ValueError("Categories list cannot be empty.")
1386
+ if not labels:
1387
+ raise ValueError("Labels list cannot be empty.")
1269
1388
 
1270
1389
  try:
1271
1390
  manager = self.get_manager("classification")
@@ -1332,7 +1451,7 @@ class PDF(ExtractionMixin, ExportMixin):
1332
1451
  try:
1333
1452
  batch_results = manager.classify_batch(
1334
1453
  item_contents=page_contents,
1335
- categories=categories,
1454
+ labels=labels,
1336
1455
  model_id=model,
1337
1456
  using=inferred_using,
1338
1457
  **kwargs,
@@ -1537,3 +1656,58 @@ class PDF(ExtractionMixin, ExportMixin):
1537
1656
  raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1538
1657
  else:
1539
1658
  raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1659
+
1660
+ # --- Classification Mixin Implementation --- #
1661
+
1662
+ def _get_classification_manager(self) -> "ClassificationManager":
1663
+ """Returns the ClassificationManager instance for this PDF."""
1664
+ try:
1665
+ return self.get_manager("classification")
1666
+ except (KeyError, RuntimeError) as e:
1667
+ raise AttributeError(f"Could not retrieve ClassificationManager: {e}") from e
1668
+
1669
+ def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, Image.Image]:
1670
+ """
1671
+ Provides the content for classifying the entire PDF.
1672
+
1673
+ Args:
1674
+ model_type: 'text' or 'vision'.
1675
+ **kwargs: Additional arguments (e.g., for text extraction or image rendering).
1676
+
1677
+ Returns:
1678
+ Extracted text (str) or the first page's image (PIL.Image).
1679
+
1680
+ Raises:
1681
+ ValueError: If model_type is 'vision' and PDF has != 1 page,
1682
+ or if model_type is unsupported, or if content cannot be generated.
1683
+ """
1684
+ if model_type == "text":
1685
+ try:
1686
+ # Extract text from the whole document
1687
+ text = self.extract_text(**kwargs) # Pass relevant kwargs
1688
+ if not text or text.isspace():
1689
+ raise ValueError("PDF contains no extractable text for classification.")
1690
+ return text
1691
+ except Exception as e:
1692
+ logger.error(f"Error extracting text for PDF classification: {e}")
1693
+ raise ValueError("Failed to extract text for classification.") from e
1694
+
1695
+ elif model_type == "vision":
1696
+ if len(self.pages) == 1:
1697
+ # Use the single page's content method
1698
+ try:
1699
+ return self.pages[0]._get_classification_content(model_type="vision", **kwargs)
1700
+ except Exception as e:
1701
+ logger.error(f"Error getting image from single page for classification: {e}")
1702
+ raise ValueError("Failed to get image from single page.") from e
1703
+ elif len(self.pages) == 0:
1704
+ raise ValueError("Cannot classify empty PDF using vision model.")
1705
+ else:
1706
+ raise ValueError(
1707
+ f"Vision classification for a PDF object is only supported for single-page PDFs. "
1708
+ f"This PDF has {len(self.pages)} pages. Use pdf.pages[0].classify() or pdf.classify_pages()."
1709
+ )
1710
+ else:
1711
+ raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
1712
+
1713
+ # --- End Classification Mixin Implementation ---