natural-pdf 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +24 -40
- natural_pdf/classification/manager.py +26 -22
- natural_pdf/classification/mixin.py +7 -7
- natural_pdf/classification/results.py +17 -9
- natural_pdf/collections/mixins.py +17 -0
- natural_pdf/collections/pdf_collection.py +78 -46
- natural_pdf/core/page.py +17 -17
- natural_pdf/core/pdf.py +192 -18
- natural_pdf/elements/collections.py +307 -3
- natural_pdf/elements/region.py +2 -3
- natural_pdf/exporters/hocr.py +540 -0
- natural_pdf/exporters/hocr_font.py +142 -0
- natural_pdf/exporters/original_pdf.py +130 -0
- natural_pdf/exporters/searchable_pdf.py +3 -3
- natural_pdf/ocr/engine_surya.py +1 -1
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/METADATA +1 -2
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/RECORD +20 -17
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -40,10 +40,10 @@ if TYPE_CHECKING:
|
|
40
40
|
from natural_pdf.elements.base import Element
|
41
41
|
from natural_pdf.elements.collections import ElementCollection
|
42
42
|
|
43
|
-
# New Imports
|
43
|
+
# # New Imports
|
44
44
|
import itertools
|
45
45
|
|
46
|
-
# Deskew Imports (Conditional)
|
46
|
+
# # Deskew Imports (Conditional)
|
47
47
|
import numpy as np
|
48
48
|
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
49
49
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
@@ -55,7 +55,7 @@ from natural_pdf.analyzers.text_options import TextStyleOptions
|
|
55
55
|
from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
|
56
56
|
from natural_pdf.classification.manager import ClassificationManager # For type hint
|
57
57
|
|
58
|
-
# --- Classification Imports --- #
|
58
|
+
# # --- Classification Imports --- #
|
59
59
|
from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
|
60
60
|
from natural_pdf.core.element_manager import ElementManager
|
61
61
|
from natural_pdf.elements.base import Element # Import base element
|
@@ -66,7 +66,7 @@ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
|
66
66
|
from natural_pdf.qa import DocumentQA, get_qa_engine
|
67
67
|
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
68
68
|
|
69
|
-
# Import new utils
|
69
|
+
# # Import new utils
|
70
70
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
71
71
|
from natural_pdf.widgets import InteractiveViewerWidget
|
72
72
|
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
|
@@ -210,7 +210,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
210
210
|
|
211
211
|
def add_exclusion(
|
212
212
|
self,
|
213
|
-
exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
|
213
|
+
exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
|
214
214
|
label: Optional[str] = None,
|
215
215
|
) -> "Page":
|
216
216
|
"""
|
@@ -274,7 +274,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
274
274
|
|
275
275
|
return self
|
276
276
|
|
277
|
-
def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
|
277
|
+
def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
|
278
278
|
"""
|
279
279
|
Add a region to the page.
|
280
280
|
|
@@ -305,7 +305,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
305
305
|
|
306
306
|
return self
|
307
307
|
|
308
|
-
def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
|
308
|
+
def add_regions(self, regions: List["Region"], prefix: Optional[str] = None) -> "Page":
|
309
309
|
"""
|
310
310
|
Add multiple regions to the page.
|
311
311
|
|
@@ -327,7 +327,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
327
327
|
|
328
328
|
return self
|
329
329
|
|
330
|
-
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
|
330
|
+
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List["Region"]:
|
331
331
|
"""
|
332
332
|
Get all exclusion regions for this page.
|
333
333
|
Assumes self._exclusions contains tuples of (callable/Region, label).
|
@@ -1349,7 +1349,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1349
1349
|
self._highlighter.clear_page(self.index)
|
1350
1350
|
return self
|
1351
1351
|
|
1352
|
-
def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
|
1352
|
+
def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> "ElementCollection":
|
1353
1353
|
"""
|
1354
1354
|
Analyze text elements by style, adding attributes directly to elements.
|
1355
1355
|
|
@@ -1520,7 +1520,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1520
1520
|
|
1521
1521
|
def _create_text_elements_from_ocr(
|
1522
1522
|
self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
|
1523
|
-
) -> List[TextElement]:
|
1523
|
+
) -> List["TextElement"]:
|
1524
1524
|
"""DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr"""
|
1525
1525
|
logger.warning(
|
1526
1526
|
"_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
|
@@ -1532,7 +1532,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1532
1532
|
def apply_ocr(
|
1533
1533
|
self,
|
1534
1534
|
engine: Optional[str] = None,
|
1535
|
-
options: Optional[OCROptions] = None,
|
1535
|
+
options: Optional["OCROptions"] = None,
|
1536
1536
|
languages: Optional[List[str]] = None,
|
1537
1537
|
min_confidence: Optional[float] = None,
|
1538
1538
|
device: Optional[str] = None,
|
@@ -1597,12 +1597,12 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1597
1597
|
def extract_ocr_elements(
|
1598
1598
|
self,
|
1599
1599
|
engine: Optional[str] = None,
|
1600
|
-
options: Optional[OCROptions] = None,
|
1600
|
+
options: Optional["OCROptions"] = None,
|
1601
1601
|
languages: Optional[List[str]] = None,
|
1602
1602
|
min_confidence: Optional[float] = None,
|
1603
1603
|
device: Optional[str] = None,
|
1604
1604
|
resolution: Optional[int] = None,
|
1605
|
-
) -> List[TextElement]:
|
1605
|
+
) -> List["TextElement"]:
|
1606
1606
|
"""
|
1607
1607
|
Extract text elements using OCR *without* adding them to the page's elements.
|
1608
1608
|
Uses the shared OCRManager instance.
|
@@ -1716,7 +1716,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1716
1716
|
return (self._page.width, self._page.height)
|
1717
1717
|
|
1718
1718
|
@property
|
1719
|
-
def layout_analyzer(self) -> LayoutAnalyzer:
|
1719
|
+
def layout_analyzer(self) -> "LayoutAnalyzer":
|
1720
1720
|
"""Get or create the layout analyzer for this page."""
|
1721
1721
|
if self._layout_analyzer is None:
|
1722
1722
|
if not self._layout_manager:
|
@@ -1728,7 +1728,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1728
1728
|
def analyze_layout(
|
1729
1729
|
self,
|
1730
1730
|
engine: Optional[str] = None,
|
1731
|
-
options: Optional[LayoutOptions] = None,
|
1731
|
+
options: Optional["LayoutOptions"] = None,
|
1732
1732
|
confidence: Optional[float] = None,
|
1733
1733
|
classes: Optional[List[str]] = None,
|
1734
1734
|
exclude_classes: Optional[List[str]] = None,
|
@@ -1736,7 +1736,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1736
1736
|
existing: str = "replace",
|
1737
1737
|
model_name: Optional[str] = None,
|
1738
1738
|
client: Optional[Any] = None, # Add client parameter
|
1739
|
-
) -> ElementCollection[Region]:
|
1739
|
+
) -> "ElementCollection[Region]":
|
1740
1740
|
"""
|
1741
1741
|
Analyze the page layout using the configured LayoutManager.
|
1742
1742
|
Adds detected Region objects to the page's element manager.
|
@@ -1813,7 +1813,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1813
1813
|
|
1814
1814
|
def get_section_between(
|
1815
1815
|
self, start_element=None, end_element=None, boundary_inclusion="both"
|
1816
|
-
) -> Optional[Region]: # Return Optional
|
1816
|
+
) -> Optional["Region"]: # Return Optional
|
1817
1817
|
"""
|
1818
1818
|
Get a section between two elements on this page.
|
1819
1819
|
"""
|
natural_pdf/core/pdf.py
CHANGED
@@ -60,6 +60,14 @@ except ImportError:
|
|
60
60
|
"Search dependencies are not installed. Install with: pip install natural-pdf[search]"
|
61
61
|
)
|
62
62
|
|
63
|
+
try:
|
64
|
+
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
65
|
+
except ImportError:
|
66
|
+
create_searchable_pdf = None
|
67
|
+
try:
|
68
|
+
from natural_pdf.exporters.original_pdf import create_original_pdf
|
69
|
+
except ImportError:
|
70
|
+
create_original_pdf = None
|
63
71
|
|
64
72
|
logger = logging.getLogger("natural_pdf.core.pdf")
|
65
73
|
tqdm = get_tqdm()
|
@@ -84,7 +92,7 @@ except ImportError:
|
|
84
92
|
# End Deskew Imports
|
85
93
|
|
86
94
|
|
87
|
-
class PDF(ExtractionMixin, ExportMixin):
|
95
|
+
class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
88
96
|
"""
|
89
97
|
Enhanced PDF wrapper built on top of pdfplumber.
|
90
98
|
|
@@ -194,6 +202,7 @@ class PDF(ExtractionMixin, ExportMixin):
|
|
194
202
|
|
195
203
|
self._initialize_managers()
|
196
204
|
self._initialize_highlighter()
|
205
|
+
self.analyses: Dict[str, Any] = {}
|
197
206
|
|
198
207
|
def _initialize_managers(self):
|
199
208
|
"""Initialize manager instances based on DEFAULT_MANAGERS."""
|
@@ -259,7 +268,7 @@ class PDF(ExtractionMixin, ExportMixin):
|
|
259
268
|
return self
|
260
269
|
|
261
270
|
def add_exclusion(
|
262
|
-
self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
|
271
|
+
self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
|
263
272
|
) -> "PDF":
|
264
273
|
"""
|
265
274
|
Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
|
@@ -467,7 +476,7 @@ class PDF(ExtractionMixin, ExportMixin):
|
|
467
476
|
return self
|
468
477
|
|
469
478
|
def add_region(
|
470
|
-
self, region_func: Callable[["Page"], Optional[Region]], name: str = None
|
479
|
+
self, region_func: Callable[["Page"], Optional["Region"]], name: str = None
|
471
480
|
) -> "PDF":
|
472
481
|
"""
|
473
482
|
Add a region function to the PDF.
|
@@ -768,23 +777,133 @@ class PDF(ExtractionMixin, ExportMixin):
|
|
768
777
|
|
769
778
|
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
    """
    DEPRECATED: Use save_pdf(..., ocr=True) instead.

    Saves the PDF with an OCR text layer, making content searchable.

    Requires optional dependencies. Install with: pip install "natural-pdf[ocr-export]"

    Args:
        output_path: Path to save the searchable PDF
        dpi: Resolution for rendering and OCR overlay
        **kwargs: Additional keyword arguments passed to the exporter

    Raises:
        ImportError: If the searchable-PDF exporter is unavailable
            (``create_searchable_pdf`` is None).
    """
    # Function-scope import so this method does not depend on a new
    # module-level import.
    import warnings

    deprecation_msg = (
        "PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
    )
    # Emit a real DeprecationWarning so warning filters and test runners can
    # see it; stacklevel=2 attributes it to the caller. The log line is kept
    # for anyone relying on the existing log output.
    warnings.warn(deprecation_msg, DeprecationWarning, stacklevel=2)
    logger.warning(deprecation_msg)

    if create_searchable_pdf is None:
        raise ImportError(
            "Saving searchable PDF requires 'pikepdf' and 'Pillow'. "
            'Install with: pip install "natural-pdf[ocr-export]"'
        )

    # Call the exporter directly, passing self (the PDF instance). No success
    # message is logged here; the original note says the exporter handles it.
    create_searchable_pdf(self, str(output_path), dpi=dpi, **kwargs)
|
803
|
+
|
804
|
+
def save_pdf(
    self,
    output_path: Union[str, Path],
    ocr: bool = False,
    original: bool = False,
    dpi: int = 300,
):
    """
    Saves the PDF object (all its pages) to a new file.

    Choose one saving mode:
    - `ocr=True`: Creates a new, image-based PDF using OCR results from all pages.
      Text generated during the natural-pdf session becomes searchable,
      but original vector content is lost. Requires 'ocr-export' extras.
    - `original=True`: Saves a copy of the original PDF file this object represents.
      Any OCR results or analyses from the natural-pdf session are NOT included.
      If the PDF was opened from an in-memory buffer, this mode may not be suitable.
      Requires 'ocr-export' extras.

    Args:
        output_path: Path to save the new PDF file.
        ocr: If True, save as a searchable, image-based PDF using OCR data.
        original: If True, save the original source PDF content.
        dpi: Resolution (dots per inch) used only when ocr=True.

    Raises:
        ValueError: If the PDF has no pages, or if neither or both of 'ocr'
                    and 'original' are truthy.
        ImportError: If the exporter for the chosen mode could not be imported.
        RuntimeError: If the searchable-PDF exporter fails (ocr=True only).
            With original=True, exporter exceptions propagate unchanged.
    """
    if not self.pages:
        raise ValueError("Cannot save an empty PDF object.")

    # Compare truthiness instead of XOR-ing the raw values: a bitwise XOR lets
    # e.g. ocr=2, original=True through (2 ^ 1 == 3) and raises TypeError for
    # non-numeric flags.
    if bool(ocr) == bool(original):
        raise ValueError("Exactly one of 'ocr' or 'original' must be True.")

    output_path_str = str(Path(output_path))

    if ocr:
        if create_searchable_pdf is None:
            raise ImportError(
                "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
                'Install with: pip install "natural-pdf[ocr-export]"'
            )

        # Warn when any page has content that is not OCR-derived, because the
        # image-based output will not preserve it.
        has_vector_elements = any(
            getattr(page, "rects", None)
            or getattr(page, "lines", None)
            or getattr(page, "curves", None)
            or any(
                getattr(el, "source", None) != "ocr"
                for el in getattr(page, "chars", None) or ()
            )
            or any(
                getattr(el, "source", None) != "ocr"
                for el in getattr(page, "words", None) or ()
            )
            for page in self.pages
        )
        if has_vector_elements:
            logger.warning(
                "Warning: Saving with ocr=True creates an image-based PDF. "
                "Original vector elements (rects, lines, non-OCR text/chars) "
                "will not be preserved in the output file."
            )

        logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
        try:
            # Delegate to the searchable PDF exporter, passing self (PDF instance)
            create_searchable_pdf(self, output_path_str, dpi=dpi)
        except Exception as e:
            raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
        return

    # original mode (the only remaining possibility after the check above)
    if create_original_pdf is None:
        raise ImportError(
            "Saving with original=True requires 'pikepdf'. "
            'Install with: pip install "natural-pdf[ocr-export]"'
        )

    # Warn when OCR text exists in this session, since it will not be saved.
    has_ocr_elements = False
    for page in self.pages:
        if hasattr(page, "find_all"):
            if page.find_all("text[source=ocr]"):
                has_ocr_elements = True
                break
        elif hasattr(page, "words"):  # Fallback when find_all is unavailable
            if any(getattr(el, "source", None) == "ocr" for el in page.words):
                has_ocr_elements = True
                break
    if has_ocr_elements:
        logger.warning(
            "Warning: Saving with original=True preserves original page content. "
            "OCR text generated in this session will not be included in the saved file."
        )

    logger.info(f"Saving original PDF content to: {output_path_str}")
    # Delegate to the original PDF exporter, passing self (PDF instance).
    # Exporter exceptions propagate as-is, as before; the former
    # `except Exception as e: raise e` wrapper was a no-op.
    create_original_pdf(self, output_path_str)
|
788
907
|
|
789
908
|
def ask(
|
790
909
|
self,
|
@@ -849,9 +968,9 @@ class PDF(ExtractionMixin, ExportMixin):
|
|
849
968
|
|
850
969
|
def search_within_index(
|
851
970
|
self,
|
852
|
-
query: Union[str, Path, Image.Image, Region],
|
853
|
-
search_service: SearchServiceProtocol,
|
854
|
-
options: Optional[SearchOptions] = None,
|
971
|
+
query: Union[str, Path, Image.Image, "Region"],
|
972
|
+
search_service: "SearchServiceProtocol",
|
973
|
+
options: Optional["SearchOptions"] = None,
|
855
974
|
) -> List[Dict[str, Any]]:
|
856
975
|
"""
|
857
976
|
Finds relevant documents from this PDF within a search index.
|
@@ -1243,7 +1362,7 @@ class PDF(ExtractionMixin, ExportMixin):
|
|
1243
1362
|
|
1244
1363
|
def classify_pages(
|
1245
1364
|
self,
|
1246
|
-
|
1365
|
+
labels: List[str],
|
1247
1366
|
model: Optional[str] = None,
|
1248
1367
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
1249
1368
|
analysis_key: str = "classification",
|
@@ -1254,7 +1373,7 @@ class PDF(ExtractionMixin, ExportMixin):
|
|
1254
1373
|
Classifies specified pages of the PDF.
|
1255
1374
|
|
1256
1375
|
Args:
|
1257
|
-
|
1376
|
+
labels: List of category names
|
1258
1377
|
model: Model identifier ('text', 'vision', or specific HF ID)
|
1259
1378
|
pages: Page indices, slice, or None for all pages
|
1260
1379
|
analysis_key: Key to store results in page's analyses dict
|
@@ -1264,8 +1383,8 @@ class PDF(ExtractionMixin, ExportMixin):
|
|
1264
1383
|
Returns:
|
1265
1384
|
Self for method chaining
|
1266
1385
|
"""
|
1267
|
-
if not
|
1268
|
-
raise ValueError("
|
1386
|
+
if not labels:
|
1387
|
+
raise ValueError("Labels list cannot be empty.")
|
1269
1388
|
|
1270
1389
|
try:
|
1271
1390
|
manager = self.get_manager("classification")
|
@@ -1332,7 +1451,7 @@ class PDF(ExtractionMixin, ExportMixin):
|
|
1332
1451
|
try:
|
1333
1452
|
batch_results = manager.classify_batch(
|
1334
1453
|
item_contents=page_contents,
|
1335
|
-
|
1454
|
+
labels=labels,
|
1336
1455
|
model_id=model,
|
1337
1456
|
using=inferred_using,
|
1338
1457
|
**kwargs,
|
@@ -1537,3 +1656,58 @@ class PDF(ExtractionMixin, ExportMixin):
|
|
1537
1656
|
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1538
1657
|
else:
|
1539
1658
|
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1659
|
+
|
1660
|
+
# --- Classification Mixin Implementation --- #
|
1661
|
+
|
1662
|
+
def _get_classification_manager(self) -> "ClassificationManager":
|
1663
|
+
"""Returns the ClassificationManager instance for this PDF."""
|
1664
|
+
try:
|
1665
|
+
return self.get_manager("classification")
|
1666
|
+
except (KeyError, RuntimeError) as e:
|
1667
|
+
raise AttributeError(f"Could not retrieve ClassificationManager: {e}") from e
|
1668
|
+
|
1669
|
+
def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, Image.Image]:
|
1670
|
+
"""
|
1671
|
+
Provides the content for classifying the entire PDF.
|
1672
|
+
|
1673
|
+
Args:
|
1674
|
+
model_type: 'text' or 'vision'.
|
1675
|
+
**kwargs: Additional arguments (e.g., for text extraction or image rendering).
|
1676
|
+
|
1677
|
+
Returns:
|
1678
|
+
Extracted text (str) or the first page's image (PIL.Image).
|
1679
|
+
|
1680
|
+
Raises:
|
1681
|
+
ValueError: If model_type is 'vision' and PDF has != 1 page,
|
1682
|
+
or if model_type is unsupported, or if content cannot be generated.
|
1683
|
+
"""
|
1684
|
+
if model_type == "text":
|
1685
|
+
try:
|
1686
|
+
# Extract text from the whole document
|
1687
|
+
text = self.extract_text(**kwargs) # Pass relevant kwargs
|
1688
|
+
if not text or text.isspace():
|
1689
|
+
raise ValueError("PDF contains no extractable text for classification.")
|
1690
|
+
return text
|
1691
|
+
except Exception as e:
|
1692
|
+
logger.error(f"Error extracting text for PDF classification: {e}")
|
1693
|
+
raise ValueError("Failed to extract text for classification.") from e
|
1694
|
+
|
1695
|
+
elif model_type == "vision":
|
1696
|
+
if len(self.pages) == 1:
|
1697
|
+
# Use the single page's content method
|
1698
|
+
try:
|
1699
|
+
return self.pages[0]._get_classification_content(model_type="vision", **kwargs)
|
1700
|
+
except Exception as e:
|
1701
|
+
logger.error(f"Error getting image from single page for classification: {e}")
|
1702
|
+
raise ValueError("Failed to get image from single page.") from e
|
1703
|
+
elif len(self.pages) == 0:
|
1704
|
+
raise ValueError("Cannot classify empty PDF using vision model.")
|
1705
|
+
else:
|
1706
|
+
raise ValueError(
|
1707
|
+
f"Vision classification for a PDF object is only supported for single-page PDFs. "
|
1708
|
+
f"This PDF has {len(self.pages)} pages. Use pdf.pages[0].classify() or pdf.classify_pages()."
|
1709
|
+
)
|
1710
|
+
else:
|
1711
|
+
raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
|
1712
|
+
|
1713
|
+
# --- End Classification Mixin Implementation ---
|