natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +29 -40
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +120 -40
- natural_pdf/core/page.py +20 -18
- natural_pdf/core/pdf.py +146 -13
- natural_pdf/elements/base.py +17 -0
- natural_pdf/elements/collections.py +374 -30
- natural_pdf/elements/region.py +45 -14
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +519 -0
- natural_pdf/exporters/hocr_font.py +136 -0
- natural_pdf/exporters/original_pdf.py +127 -0
- natural_pdf/exporters/searchable_pdf.py +2 -12
- natural_pdf/ocr/engine_surya.py +1 -1
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
@@ -407,7 +407,17 @@ class ElementManager:
|
|
407
407
|
char_dict_data = ocr_char_dict # Use the one we already created
|
408
408
|
char_dict_data["object_type"] = "char" # Mark as char type
|
409
409
|
char_dict_data.setdefault("adv", char_dict_data.get("width", 0))
|
410
|
-
|
410
|
+
|
411
|
+
# Create a TextElement for the char representation
|
412
|
+
# Ensure _char_dicts is handled correctly by TextElement constructor
|
413
|
+
# For an OCR word represented as a char, its _char_dicts can be a list containing its own data
|
414
|
+
char_element_specific_data = char_dict_data.copy()
|
415
|
+
char_element_specific_data["_char_dicts"] = [char_dict_data.copy()]
|
416
|
+
|
417
|
+
ocr_char_as_element = TextElement(char_element_specific_data, self._page)
|
418
|
+
self._elements["chars"].append(
|
419
|
+
ocr_char_as_element
|
420
|
+
) # Append TextElement instance
|
411
421
|
|
412
422
|
except (KeyError, ValueError, TypeError) as e:
|
413
423
|
logger.error(f"Failed to process OCR result: {result}. Error: {e}", exc_info=True)
|
@@ -611,13 +611,13 @@ class HighlightingService:
|
|
611
611
|
|
612
612
|
Args:
|
613
613
|
page_index: The 0-based index of the page to render.
|
614
|
-
scale: Scale factor for rendering highlights.
|
614
|
+
scale: Scale factor for rendering highlights if width/height/resolution not in kwargs.
|
615
615
|
labels: Whether to include a legend for highlights.
|
616
616
|
legend_position: Position of the legend.
|
617
617
|
render_ocr: Whether to render OCR text on the image.
|
618
|
-
resolution: Optional resolution (DPI) for the base page image.
|
619
|
-
Defaults to scale * 72.
|
620
|
-
kwargs: Additional keyword arguments for pdfplumber's page.to_image.
|
618
|
+
resolution: Optional resolution (DPI) for the base page image if width/height not in kwargs.
|
619
|
+
Defaults to scale * 72 if not otherwise specified.
|
620
|
+
kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
|
621
621
|
|
622
622
|
Returns:
|
623
623
|
A PIL Image object of the rendered page, or None if rendering fails.
|
@@ -626,34 +626,81 @@ class HighlightingService:
|
|
626
626
|
logger.error(f"Invalid page index {page_index} for rendering.")
|
627
627
|
return None
|
628
628
|
|
629
|
-
|
629
|
+
page_obj = self._pdf[page_index] # Renamed to avoid conflict
|
630
630
|
highlights_on_page = self.get_highlights_for_page(page_index)
|
631
631
|
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
632
|
+
to_image_args = kwargs.copy()
|
633
|
+
actual_scale_x = None
|
634
|
+
actual_scale_y = None
|
635
|
+
|
636
|
+
if "width" in to_image_args and to_image_args["width"] is not None:
|
637
|
+
logger.debug(f"Rendering page {page_index} with width={to_image_args['width']}.")
|
638
|
+
if "height" in to_image_args: to_image_args.pop("height", None)
|
639
|
+
# Actual scale will be calculated after image creation
|
640
|
+
elif "height" in to_image_args and to_image_args["height"] is not None:
|
641
|
+
logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
|
642
|
+
# Actual scale will be calculated after image creation
|
643
|
+
else:
|
644
|
+
# Use explicit resolution from kwargs if present, then the resolution param, then scale
|
645
|
+
render_resolution = to_image_args.pop("resolution", resolution) # Use and remove from kwargs if present
|
646
|
+
if render_resolution is None:
|
647
|
+
render_resolution = scale * 72
|
648
|
+
to_image_args["resolution"] = render_resolution # Add it back for the call
|
649
|
+
actual_scale_x = render_resolution / 72.0
|
650
|
+
actual_scale_y = render_resolution / 72.0
|
651
|
+
logger.debug(f"Rendering page {page_index} with resolution {render_resolution} (scale: {actual_scale_x:.2f}).")
|
652
|
+
|
653
|
+
try:
|
654
|
+
# base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
|
655
|
+
img_object = page_obj._page.to_image(**to_image_args)
|
656
|
+
base_image_pil = (
|
657
|
+
img_object.annotated
|
658
|
+
if hasattr(img_object, "annotated")
|
659
|
+
else img_object._repr_png_()
|
660
|
+
)
|
661
|
+
if isinstance(base_image_pil, bytes):
|
662
|
+
from io import BytesIO
|
663
|
+
base_image_pil = Image.open(BytesIO(base_image_pil))
|
664
|
+
base_image_pil = base_image_pil.convert("RGBA") # Ensure RGBA for renderer
|
665
|
+
logger.debug(
|
666
|
+
f"Base image for page {page_index} rendered. Size: {base_image_pil.size}."
|
667
|
+
)
|
668
|
+
|
669
|
+
if actual_scale_x is None or actual_scale_y is None: # If not set by resolution path
|
670
|
+
if page_obj.width > 0:
|
671
|
+
actual_scale_x = base_image_pil.width / page_obj.width
|
672
|
+
else:
|
673
|
+
actual_scale_x = scale # Fallback
|
674
|
+
if page_obj.height > 0:
|
675
|
+
actual_scale_y = base_image_pil.height / page_obj.height
|
676
|
+
else:
|
677
|
+
actual_scale_y = scale # Fallback
|
678
|
+
logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}")
|
679
|
+
|
680
|
+
except Exception as e:
|
681
|
+
logger.error(f"Error creating base image for page {page_index}: {e}", exc_info=True)
|
682
|
+
return None
|
683
|
+
|
684
|
+
renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
|
638
685
|
|
639
686
|
# --- Render Highlights ---
|
640
687
|
rendered_image: Image.Image
|
641
688
|
if highlights_on_page:
|
642
689
|
renderer = HighlightRenderer(
|
643
|
-
page=
|
644
|
-
base_image=
|
690
|
+
page=page_obj,
|
691
|
+
base_image=base_image_pil,
|
645
692
|
highlights=highlights_on_page,
|
646
|
-
scale=scale
|
693
|
+
scale=renderer_scale, # Use the determined actual scale
|
647
694
|
render_ocr=render_ocr,
|
648
695
|
)
|
649
696
|
rendered_image = renderer.render()
|
650
697
|
else:
|
651
698
|
if render_ocr:
|
652
|
-
# Still render OCR even if no highlights
|
653
|
-
renderer = HighlightRenderer(
|
699
|
+
# Still render OCR even if no highlights, using the determined actual scale
|
700
|
+
renderer = HighlightRenderer(page_obj, base_image_pil, [], renderer_scale, True)
|
654
701
|
rendered_image = renderer.render()
|
655
702
|
else:
|
656
|
-
rendered_image =
|
703
|
+
rendered_image = base_image_pil # No highlights, no OCR requested
|
657
704
|
|
658
705
|
# --- Add Legend (Based ONLY on this page's highlights) ---
|
659
706
|
if labels:
|
@@ -697,12 +744,12 @@ class HighlightingService:
|
|
697
744
|
Args:
|
698
745
|
page_index: Index of the page to render.
|
699
746
|
temporary_highlights: List of highlight data dicts (from ElementCollection._prepare).
|
700
|
-
scale:
|
747
|
+
scale: Original scale factor for rendering, used if width/height are not provided.
|
701
748
|
labels: Whether to include a legend.
|
702
749
|
legend_position: Position of the legend.
|
703
750
|
render_ocr: Whether to render OCR text.
|
704
|
-
resolution: Resolution for base page image rendering.
|
705
|
-
**kwargs: Additional args for pdfplumber's to_image.
|
751
|
+
resolution: Resolution for base page image rendering if width/height not used.
|
752
|
+
**kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
|
706
753
|
|
707
754
|
Returns:
|
708
755
|
PIL Image of the preview, or None if rendering fails.
|
@@ -711,35 +758,64 @@ class HighlightingService:
|
|
711
758
|
logger.error(f"Invalid page index {page_index} for render_preview.")
|
712
759
|
return None
|
713
760
|
|
714
|
-
|
715
|
-
|
761
|
+
page_obj = self._pdf.pages[page_index]
|
762
|
+
|
763
|
+
to_image_args = kwargs.copy()
|
764
|
+
actual_scale_x = None
|
765
|
+
actual_scale_y = None
|
766
|
+
|
767
|
+
# Determine arguments for page._page.to_image()
|
768
|
+
if "width" in to_image_args and to_image_args["width"] is not None:
|
769
|
+
logger.debug(f"Rendering preview for page {page_index} with width={to_image_args['width']}.")
|
770
|
+
# Resolution is implicitly handled by pdfplumber when width is set
|
771
|
+
if "height" in to_image_args:
|
772
|
+
to_image_args.pop("height", None)
|
773
|
+
# after image is created, we will calculate actual_scale_x and actual_scale_y
|
774
|
+
|
775
|
+
elif "height" in to_image_args and to_image_args["height"] is not None:
|
776
|
+
logger.debug(f"Rendering preview for page {page_index} with height={to_image_args['height']}.")
|
777
|
+
# Resolution is implicitly handled by pdfplumber when height is set
|
778
|
+
# after image is created, we will calculate actual_scale_x and actual_scale_y
|
779
|
+
else:
|
780
|
+
# Neither width nor height is provided, use resolution or scale.
|
781
|
+
render_resolution = resolution if resolution is not None else scale * 72
|
782
|
+
to_image_args["resolution"] = render_resolution
|
783
|
+
actual_scale_x = render_resolution / 72.0
|
784
|
+
actual_scale_y = render_resolution / 72.0
|
785
|
+
logger.debug(f"Rendering preview for page {page_index} with resolution={render_resolution} (scale: {actual_scale_x:.2f}).")
|
716
786
|
|
717
787
|
try:
|
718
|
-
|
719
|
-
|
720
|
-
base_image = (
|
788
|
+
img_object = page_obj._page.to_image(**to_image_args)
|
789
|
+
base_image_pil = (
|
721
790
|
img_object.annotated
|
722
791
|
if hasattr(img_object, "annotated")
|
723
792
|
else img_object._repr_png_()
|
724
793
|
)
|
725
|
-
if isinstance(
|
794
|
+
if isinstance(base_image_pil, bytes):
|
726
795
|
from io import BytesIO
|
796
|
+
base_image_pil = Image.open(BytesIO(base_image_pil))
|
797
|
+
base_image_pil = base_image_pil.convert("RGB")
|
727
798
|
|
728
|
-
|
729
|
-
|
799
|
+
# If scale was not determined by resolution, calculate it now from base_image_pil dimensions
|
800
|
+
if actual_scale_x is None or actual_scale_y is None:
|
801
|
+
if page_obj.width > 0:
|
802
|
+
actual_scale_x = base_image_pil.width / page_obj.width
|
803
|
+
else:
|
804
|
+
actual_scale_x = scale # Fallback to original scale
|
805
|
+
if page_obj.height > 0:
|
806
|
+
actual_scale_y = base_image_pil.height / page_obj.height
|
807
|
+
else:
|
808
|
+
actual_scale_y = scale # Fallback to original scale
|
809
|
+
logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})")
|
730
810
|
|
731
811
|
# Convert temporary highlight dicts to Highlight objects
|
732
|
-
# Note: Colors/labels should be determined *here* for temporary preview
|
733
812
|
preview_highlights = []
|
734
813
|
for hl_data in temporary_highlights:
|
735
|
-
# Determine the final color using the service logic
|
736
814
|
final_color = self._determine_highlight_color(
|
737
815
|
color_input=hl_data.get("color"),
|
738
816
|
label=hl_data.get("label"),
|
739
817
|
use_color_cycling=hl_data.get("use_color_cycling", False),
|
740
818
|
)
|
741
|
-
|
742
|
-
# Extract potential attributes to draw
|
743
819
|
attrs_to_draw = {}
|
744
820
|
element = hl_data.get("element")
|
745
821
|
include_attrs = hl_data.get("include_attrs")
|
@@ -753,25 +829,29 @@ class HighlightingService:
|
|
753
829
|
logger.warning(
|
754
830
|
f"Attribute '{attr_name}' not found on element {element}"
|
755
831
|
)
|
756
|
-
|
757
|
-
# Add highlight if geometry exists
|
758
832
|
if hl_data.get("bbox") or hl_data.get("polygon"):
|
759
833
|
preview_highlights.append(
|
760
834
|
Highlight(
|
761
835
|
page_index=hl_data["page_index"],
|
762
836
|
bbox=hl_data.get("bbox"),
|
763
837
|
polygon=hl_data.get("polygon"),
|
764
|
-
color=final_color,
|
838
|
+
color=final_color,
|
765
839
|
label=hl_data.get("label"),
|
766
840
|
attributes=attrs_to_draw,
|
767
841
|
)
|
768
842
|
)
|
769
|
-
|
770
|
-
#
|
771
|
-
|
843
|
+
|
844
|
+
# Use the calculated actual_scale_x for the HighlightRenderer
|
845
|
+
# Assuming HighlightRenderer can handle a single scale or we adapt it.
|
846
|
+
# For now, pdfplumber usually maintains aspect ratio, so one scale should be okay.
|
847
|
+
# If not, HighlightRenderer needs to accept scale_x and scale_y.
|
848
|
+
# We will use actual_scale_x assuming aspect ratio is maintained by pdfplumber,
|
849
|
+
# or if not, it's a reasonable approximation for highlight scaling.
|
850
|
+
renderer_scale = actual_scale_x
|
851
|
+
|
852
|
+
renderer = HighlightRenderer(page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr)
|
772
853
|
rendered_image = renderer.render()
|
773
854
|
|
774
|
-
# Create legend only from temporary highlights
|
775
855
|
legend = None
|
776
856
|
if labels:
|
777
857
|
preview_labels = {h.label: h.color for h in preview_highlights if h.label}
|
@@ -781,7 +861,7 @@ class HighlightingService:
|
|
781
861
|
rendered_image, legend, position=legend_position
|
782
862
|
)
|
783
863
|
else:
|
784
|
-
final_image = rendered_image
|
864
|
+
final_image = rendered_image
|
785
865
|
else:
|
786
866
|
final_image = rendered_image
|
787
867
|
|
natural_pdf/core/page.py
CHANGED
@@ -40,10 +40,10 @@ if TYPE_CHECKING:
|
|
40
40
|
from natural_pdf.elements.base import Element
|
41
41
|
from natural_pdf.elements.collections import ElementCollection
|
42
42
|
|
43
|
-
# New Imports
|
43
|
+
# # New Imports
|
44
44
|
import itertools
|
45
45
|
|
46
|
-
# Deskew Imports (Conditional)
|
46
|
+
# # Deskew Imports (Conditional)
|
47
47
|
import numpy as np
|
48
48
|
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
49
49
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
@@ -55,7 +55,7 @@ from natural_pdf.analyzers.text_options import TextStyleOptions
|
|
55
55
|
from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
|
56
56
|
from natural_pdf.classification.manager import ClassificationManager # For type hint
|
57
57
|
|
58
|
-
# --- Classification Imports --- #
|
58
|
+
# # --- Classification Imports --- #
|
59
59
|
from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
|
60
60
|
from natural_pdf.core.element_manager import ElementManager
|
61
61
|
from natural_pdf.elements.base import Element # Import base element
|
@@ -66,7 +66,7 @@ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
|
66
66
|
from natural_pdf.qa import DocumentQA, get_qa_engine
|
67
67
|
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
68
68
|
|
69
|
-
# Import new utils
|
69
|
+
# # Import new utils
|
70
70
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
71
71
|
from natural_pdf.widgets import InteractiveViewerWidget
|
72
72
|
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
|
@@ -210,7 +210,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
210
210
|
|
211
211
|
def add_exclusion(
|
212
212
|
self,
|
213
|
-
exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
|
213
|
+
exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
|
214
214
|
label: Optional[str] = None,
|
215
215
|
) -> "Page":
|
216
216
|
"""
|
@@ -274,7 +274,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
274
274
|
|
275
275
|
return self
|
276
276
|
|
277
|
-
def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
|
277
|
+
def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
|
278
278
|
"""
|
279
279
|
Add a region to the page.
|
280
280
|
|
@@ -305,7 +305,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
305
305
|
|
306
306
|
return self
|
307
307
|
|
308
|
-
def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
|
308
|
+
def add_regions(self, regions: List["Region"], prefix: Optional[str] = None) -> "Page":
|
309
309
|
"""
|
310
310
|
Add multiple regions to the page.
|
311
311
|
|
@@ -327,7 +327,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
327
327
|
|
328
328
|
return self
|
329
329
|
|
330
|
-
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
|
330
|
+
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List["Region"]:
|
331
331
|
"""
|
332
332
|
Get all exclusion regions for this page.
|
333
333
|
Assumes self._exclusions contains tuples of (callable/Region, label).
|
@@ -1349,7 +1349,9 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1349
1349
|
self._highlighter.clear_page(self.index)
|
1350
1350
|
return self
|
1351
1351
|
|
1352
|
-
def analyze_text_styles(
|
1352
|
+
def analyze_text_styles(
|
1353
|
+
self, options: Optional[TextStyleOptions] = None
|
1354
|
+
) -> "ElementCollection":
|
1353
1355
|
"""
|
1354
1356
|
Analyze text elements by style, adding attributes directly to elements.
|
1355
1357
|
|
@@ -1520,7 +1522,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1520
1522
|
|
1521
1523
|
def _create_text_elements_from_ocr(
|
1522
1524
|
self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
|
1523
|
-
) -> List[TextElement]:
|
1525
|
+
) -> List["TextElement"]:
|
1524
1526
|
"""DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr"""
|
1525
1527
|
logger.warning(
|
1526
1528
|
"_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
|
@@ -1532,7 +1534,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1532
1534
|
def apply_ocr(
|
1533
1535
|
self,
|
1534
1536
|
engine: Optional[str] = None,
|
1535
|
-
options: Optional[OCROptions] = None,
|
1537
|
+
options: Optional["OCROptions"] = None,
|
1536
1538
|
languages: Optional[List[str]] = None,
|
1537
1539
|
min_confidence: Optional[float] = None,
|
1538
1540
|
device: Optional[str] = None,
|
@@ -1597,12 +1599,12 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1597
1599
|
def extract_ocr_elements(
|
1598
1600
|
self,
|
1599
1601
|
engine: Optional[str] = None,
|
1600
|
-
options: Optional[OCROptions] = None,
|
1602
|
+
options: Optional["OCROptions"] = None,
|
1601
1603
|
languages: Optional[List[str]] = None,
|
1602
1604
|
min_confidence: Optional[float] = None,
|
1603
1605
|
device: Optional[str] = None,
|
1604
1606
|
resolution: Optional[int] = None,
|
1605
|
-
) -> List[TextElement]:
|
1607
|
+
) -> List["TextElement"]:
|
1606
1608
|
"""
|
1607
1609
|
Extract text elements using OCR *without* adding them to the page's elements.
|
1608
1610
|
Uses the shared OCRManager instance.
|
@@ -1716,7 +1718,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1716
1718
|
return (self._page.width, self._page.height)
|
1717
1719
|
|
1718
1720
|
@property
|
1719
|
-
def layout_analyzer(self) -> LayoutAnalyzer:
|
1721
|
+
def layout_analyzer(self) -> "LayoutAnalyzer":
|
1720
1722
|
"""Get or create the layout analyzer for this page."""
|
1721
1723
|
if self._layout_analyzer is None:
|
1722
1724
|
if not self._layout_manager:
|
@@ -1728,7 +1730,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1728
1730
|
def analyze_layout(
|
1729
1731
|
self,
|
1730
1732
|
engine: Optional[str] = None,
|
1731
|
-
options: Optional[LayoutOptions] = None,
|
1733
|
+
options: Optional["LayoutOptions"] = None,
|
1732
1734
|
confidence: Optional[float] = None,
|
1733
1735
|
classes: Optional[List[str]] = None,
|
1734
1736
|
exclude_classes: Optional[List[str]] = None,
|
@@ -1736,7 +1738,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1736
1738
|
existing: str = "replace",
|
1737
1739
|
model_name: Optional[str] = None,
|
1738
1740
|
client: Optional[Any] = None, # Add client parameter
|
1739
|
-
) -> ElementCollection[Region]:
|
1741
|
+
) -> "ElementCollection[Region]":
|
1740
1742
|
"""
|
1741
1743
|
Analyze the page layout using the configured LayoutManager.
|
1742
1744
|
Adds detected Region objects to the page's element manager.
|
@@ -1813,7 +1815,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1813
1815
|
|
1814
1816
|
def get_section_between(
|
1815
1817
|
self, start_element=None, end_element=None, boundary_inclusion="both"
|
1816
|
-
) -> Optional[Region]: # Return Optional
|
1818
|
+
) -> Optional["Region"]: # Return Optional
|
1817
1819
|
"""
|
1818
1820
|
Get a section between two elements on this page.
|
1819
1821
|
"""
|
@@ -2130,7 +2132,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
2130
2132
|
if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
|
2131
2133
|
logger.error(
|
2132
2134
|
"Interactive viewer requires optional dependencies ('ipywidgets'). "
|
2133
|
-
"Install with `pip install natural-pdf[
|
2135
|
+
"Install with `pip install natural-pdf[viewer]`"
|
2134
2136
|
)
|
2135
2137
|
# raise ImportError("ipywidgets not found.") # Option 1: Raise error
|
2136
2138
|
return None # Option 2: Return None gracefully
|
natural_pdf/core/pdf.py
CHANGED
@@ -61,6 +61,15 @@ except ImportError:
|
|
61
61
|
)
|
62
62
|
|
63
63
|
|
64
|
+
try:
|
65
|
+
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
66
|
+
except ImportError:
|
67
|
+
create_searchable_pdf = None
|
68
|
+
try:
|
69
|
+
from natural_pdf.exporters.original_pdf import create_original_pdf
|
70
|
+
except ImportError:
|
71
|
+
create_original_pdf = None
|
72
|
+
|
64
73
|
logger = logging.getLogger("natural_pdf.core.pdf")
|
65
74
|
tqdm = get_tqdm()
|
66
75
|
|
@@ -260,7 +269,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
260
269
|
return self
|
261
270
|
|
262
271
|
def add_exclusion(
|
263
|
-
self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
|
272
|
+
self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
|
264
273
|
) -> "PDF":
|
265
274
|
"""
|
266
275
|
Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
|
@@ -468,7 +477,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
468
477
|
return self
|
469
478
|
|
470
479
|
def add_region(
|
471
|
-
self, region_func: Callable[["Page"], Optional[Region]], name: str = None
|
480
|
+
self, region_func: Callable[["Page"], Optional["Region"]], name: str = None
|
472
481
|
) -> "PDF":
|
473
482
|
"""
|
474
483
|
Add a region function to the PDF.
|
@@ -769,23 +778,137 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
769
778
|
|
770
779
|
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
771
780
|
"""
|
781
|
+
DEPRECATED: Use save_pdf(..., ocr=True) instead.
|
772
782
|
Saves the PDF with an OCR text layer, making content searchable.
|
773
783
|
|
774
|
-
Requires optional dependencies. Install with: pip install "natural-pdf[ocr-
|
784
|
+
Requires optional dependencies. Install with: pip install \"natural-pdf[ocr-export]\"
|
775
785
|
|
776
786
|
Args:
|
777
787
|
output_path: Path to save the searchable PDF
|
778
788
|
dpi: Resolution for rendering and OCR overlay
|
779
789
|
**kwargs: Additional keyword arguments passed to the exporter
|
780
|
-
output_path: Path to save the searchable PDF
|
781
|
-
dpi: Resolution for rendering and OCR overlay
|
782
|
-
**kwargs: Additional keyword arguments passed to the exporter
|
783
790
|
"""
|
784
|
-
|
785
|
-
|
791
|
+
logger.warning(
|
792
|
+
"PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
|
793
|
+
)
|
794
|
+
if create_searchable_pdf is None:
|
795
|
+
raise ImportError(
|
796
|
+
"Saving searchable PDF requires 'pikepdf'. "
|
797
|
+
'Install with: pip install "natural-pdf[ocr-export]"'
|
798
|
+
)
|
786
799
|
output_path_str = str(output_path)
|
800
|
+
# Call the exporter directly, passing self (the PDF instance)
|
787
801
|
create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
|
788
|
-
|
802
|
+
# Logger info is handled within the exporter now
|
803
|
+
# logger.info(f"Searchable PDF saved to: {output_path_str}")
|
804
|
+
|
805
|
+
def save_pdf(
|
806
|
+
self,
|
807
|
+
output_path: Union[str, Path],
|
808
|
+
ocr: bool = False,
|
809
|
+
original: bool = False,
|
810
|
+
dpi: int = 300,
|
811
|
+
):
|
812
|
+
"""
|
813
|
+
Saves the PDF object (all its pages) to a new file.
|
814
|
+
|
815
|
+
Choose one saving mode:
|
816
|
+
- `ocr=True`: Creates a new, image-based PDF using OCR results from all pages.
|
817
|
+
Text generated during the natural-pdf session becomes searchable,
|
818
|
+
but original vector content is lost. Requires 'ocr-export' extras.
|
819
|
+
- `original=True`: Saves a copy of the original PDF file this object represents.
|
820
|
+
Any OCR results or analyses from the natural-pdf session are NOT included.
|
821
|
+
If the PDF was opened from an in-memory buffer, this mode may not be suitable.
|
822
|
+
Requires 'ocr-export' extras.
|
823
|
+
|
824
|
+
Args:
|
825
|
+
output_path: Path to save the new PDF file.
|
826
|
+
ocr: If True, save as a searchable, image-based PDF using OCR data.
|
827
|
+
original: If True, save the original source PDF content.
|
828
|
+
dpi: Resolution (dots per inch) used only when ocr=True.
|
829
|
+
|
830
|
+
Raises:
|
831
|
+
ValueError: If the PDF has no pages, if neither or both 'ocr'
|
832
|
+
and 'original' are True.
|
833
|
+
ImportError: If required libraries are not installed for the chosen mode.
|
834
|
+
RuntimeError: If an unexpected error occurs during saving.
|
835
|
+
"""
|
836
|
+
if not self.pages:
|
837
|
+
raise ValueError("Cannot save an empty PDF object.")
|
838
|
+
|
839
|
+
if not (ocr ^ original): # XOR: exactly one must be true
|
840
|
+
raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
|
841
|
+
|
842
|
+
output_path_obj = Path(output_path)
|
843
|
+
output_path_str = str(output_path_obj)
|
844
|
+
|
845
|
+
if ocr:
|
846
|
+
has_vector_elements = False
|
847
|
+
for page in self.pages:
|
848
|
+
if (
|
849
|
+
hasattr(page, "rects")
|
850
|
+
and page.rects
|
851
|
+
or hasattr(page, "lines")
|
852
|
+
and page.lines
|
853
|
+
or hasattr(page, "curves")
|
854
|
+
and page.curves
|
855
|
+
or (
|
856
|
+
hasattr(page, "chars")
|
857
|
+
and any(getattr(el, "source", None) != "ocr" for el in page.chars)
|
858
|
+
)
|
859
|
+
or (
|
860
|
+
hasattr(page, "words")
|
861
|
+
and any(getattr(el, "source", None) != "ocr" for el in page.words)
|
862
|
+
)
|
863
|
+
):
|
864
|
+
has_vector_elements = True
|
865
|
+
break
|
866
|
+
if has_vector_elements:
|
867
|
+
logger.warning(
|
868
|
+
"Warning: Saving with ocr=True creates an image-based PDF. "
|
869
|
+
"Original vector elements (rects, lines, non-OCR text/chars) "
|
870
|
+
"will not be preserved in the output file."
|
871
|
+
)
|
872
|
+
|
873
|
+
logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
|
874
|
+
try:
|
875
|
+
# Delegate to the searchable PDF exporter, passing self (PDF instance)
|
876
|
+
create_searchable_pdf(self, output_path_str, dpi=dpi)
|
877
|
+
except Exception as e:
|
878
|
+
raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
|
879
|
+
|
880
|
+
elif original:
|
881
|
+
if create_original_pdf is None:
|
882
|
+
raise ImportError(
|
883
|
+
"Saving with original=True requires 'pikepdf'. "
|
884
|
+
'Install with: pip install "natural-pdf[ocr-export]"'
|
885
|
+
)
|
886
|
+
|
887
|
+
# Optional: Add warning about losing OCR data similar to PageCollection
|
888
|
+
has_ocr_elements = False
|
889
|
+
for page in self.pages:
|
890
|
+
if hasattr(page, "find_all"):
|
891
|
+
ocr_text_elements = page.find_all("text[source=ocr]")
|
892
|
+
if ocr_text_elements:
|
893
|
+
has_ocr_elements = True
|
894
|
+
break
|
895
|
+
elif hasattr(page, "words"): # Fallback
|
896
|
+
if any(getattr(el, "source", None) == "ocr" for el in page.words):
|
897
|
+
has_ocr_elements = True
|
898
|
+
break
|
899
|
+
if has_ocr_elements:
|
900
|
+
logger.warning(
|
901
|
+
"Warning: Saving with original=True preserves original page content. "
|
902
|
+
"OCR text generated in this session will not be included in the saved file."
|
903
|
+
)
|
904
|
+
|
905
|
+
logger.info(f"Saving original PDF content to: {output_path_str}")
|
906
|
+
try:
|
907
|
+
# Delegate to the original PDF exporter, passing self (PDF instance)
|
908
|
+
create_original_pdf(self, output_path_str)
|
909
|
+
except Exception as e:
|
910
|
+
# Re-raise exception from exporter
|
911
|
+
raise e
|
789
912
|
|
790
913
|
def ask(
|
791
914
|
self,
|
@@ -850,9 +973,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
850
973
|
|
851
974
|
def search_within_index(
|
852
975
|
self,
|
853
|
-
query: Union[str, Path, Image.Image, Region],
|
854
|
-
search_service: SearchServiceProtocol,
|
855
|
-
options: Optional[SearchOptions] = None,
|
976
|
+
query: Union[str, Path, Image.Image, "Region"],
|
977
|
+
search_service: "SearchServiceProtocol",
|
978
|
+
options: Optional["SearchOptions"] = None,
|
856
979
|
) -> List[Dict[str, Any]]:
|
857
980
|
"""
|
858
981
|
Finds relevant documents from this PDF within a search index.
|
@@ -1109,6 +1232,16 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1109
1232
|
"""Context manager exit."""
|
1110
1233
|
self.close()
|
1111
1234
|
|
1235
|
+
def __repr__(self) -> str:
|
1236
|
+
"""Return a string representation of the PDF object."""
|
1237
|
+
if not hasattr(self, "_pages"):
|
1238
|
+
page_count_str = "uninitialized"
|
1239
|
+
else:
|
1240
|
+
page_count_str = str(len(self._pages))
|
1241
|
+
|
1242
|
+
source_info = getattr(self, "source_path", "unknown source")
|
1243
|
+
return f"<PDF source='{source_info}' pages={page_count_str}>"
|
1244
|
+
|
1112
1245
|
def get_id(self) -> str:
|
1113
1246
|
"""Get unique identifier for this PDF."""
|
1114
1247
|
"""Get unique identifier for this PDF."""
|
@@ -1282,7 +1415,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1282
1415
|
except ImportError:
|
1283
1416
|
raise ImportError(
|
1284
1417
|
"Classification dependencies missing. "
|
1285
|
-
'Install with: pip install "natural-pdf[
|
1418
|
+
'Install with: pip install "natural-pdf[core-ml]"'
|
1286
1419
|
)
|
1287
1420
|
raise ClassificationError("ClassificationManager not available.")
|
1288
1421
|
|