natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +117 -75
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +200 -126
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,11 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
3
3
|
|
4
|
-
|
5
|
-
try:
|
6
|
-
from PIL import Image
|
7
|
-
except ImportError:
|
8
|
-
Image = None # type: ignore
|
4
|
+
from PIL import Image
|
9
5
|
|
10
|
-
|
11
|
-
from .results import ClassificationResult # Assuming results.py is in the same dir
|
6
|
+
from .results import ClassificationResult
|
12
7
|
|
13
8
|
if TYPE_CHECKING:
|
14
|
-
# Avoid runtime import cycle
|
15
9
|
from natural_pdf.core.page import Page
|
16
10
|
from natural_pdf.elements.region import Region
|
17
11
|
|
@@ -61,14 +61,16 @@ except ImportError as e:
|
|
61
61
|
|
62
62
|
SearchServiceProtocol, SearchOptions, Indexable = object, object, object
|
63
63
|
|
64
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
65
|
+
|
64
66
|
# Import the ApplyMixin
|
65
67
|
from natural_pdf.collections.mixins import ApplyMixin
|
66
68
|
from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
|
67
69
|
|
68
|
-
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
69
70
|
|
70
|
-
|
71
|
-
|
71
|
+
class PDFCollection(
|
72
|
+
SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin
|
73
|
+
): # Add ExportMixin and ShapeDetectionMixin
|
72
74
|
def __init__(
|
73
75
|
self,
|
74
76
|
source: Union[str, Iterable[Union[str, "PDF"]]],
|
@@ -120,6 +122,7 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixi
|
|
120
122
|
def _get_pdf_class():
|
121
123
|
"""Helper method to dynamically import the PDF class."""
|
122
124
|
from natural_pdf.core.pdf import PDF
|
125
|
+
|
123
126
|
return PDF
|
124
127
|
|
125
128
|
# --- Internal Helpers ---
|
@@ -382,33 +385,25 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixi
|
|
382
385
|
pdf_path = pdf.path # Get path for logging
|
383
386
|
logger.debug(f"[{thread_id}] Starting OCR process for: {pdf_path}")
|
384
387
|
start_time = time.monotonic()
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
return pdf_path, None
|
405
|
-
except Exception as e:
|
406
|
-
end_time = time.monotonic()
|
407
|
-
logger.error(
|
408
|
-
f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
|
409
|
-
exc_info=False,
|
410
|
-
)
|
411
|
-
return pdf_path, e # Return path and error
|
388
|
+
pdf.apply_ocr( # Call apply_ocr on the original PDF object
|
389
|
+
pages=pages,
|
390
|
+
engine=engine,
|
391
|
+
languages=languages,
|
392
|
+
min_confidence=min_confidence,
|
393
|
+
device=device,
|
394
|
+
resolution=resolution,
|
395
|
+
apply_exclusions=apply_exclusions,
|
396
|
+
detect_only=detect_only,
|
397
|
+
replace=replace,
|
398
|
+
options=options,
|
399
|
+
# Note: We might want a max_workers here too for page rendering?
|
400
|
+
# For now, PDF.apply_ocr doesn't have it.
|
401
|
+
)
|
402
|
+
end_time = time.monotonic()
|
403
|
+
logger.debug(
|
404
|
+
f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
|
405
|
+
)
|
406
|
+
return pdf_path, None
|
412
407
|
|
413
408
|
# Use ThreadPoolExecutor for parallel processing if max_workers > 1
|
414
409
|
if max_workers is not None and max_workers > 1:
|
@@ -219,9 +219,7 @@ class HighlightRenderer:
|
|
219
219
|
ocr_elements = self.page.find_all("text[source=ocr]")
|
220
220
|
if not ocr_elements:
|
221
221
|
# Don't run full OCR here, just extract if already run
|
222
|
-
ocr_elements = [
|
223
|
-
el for el in self.page.words if getattr(el, "source", None) == "ocr"
|
224
|
-
]
|
222
|
+
ocr_elements = [el for el in self.page.words if getattr(el, "source", None) == "ocr"]
|
225
223
|
# Alternative: self.page.extract_ocr_elements() - but might be slow
|
226
224
|
|
227
225
|
if not ocr_elements:
|
@@ -611,7 +609,7 @@ class HighlightingService:
|
|
611
609
|
logger.error(f"Invalid page index {page_index} for rendering.")
|
612
610
|
return None
|
613
611
|
|
614
|
-
page_obj = self._pdf[page_index]
|
612
|
+
page_obj = self._pdf[page_index] # Renamed to avoid conflict
|
615
613
|
highlights_on_page = self.get_highlights_for_page(page_index)
|
616
614
|
|
617
615
|
to_image_args = kwargs.copy()
|
@@ -620,20 +618,25 @@ class HighlightingService:
|
|
620
618
|
|
621
619
|
if "width" in to_image_args and to_image_args["width"] is not None:
|
622
620
|
logger.debug(f"Rendering page {page_index} with width={to_image_args['width']}.")
|
623
|
-
if "height" in to_image_args:
|
621
|
+
if "height" in to_image_args:
|
622
|
+
to_image_args.pop("height", None)
|
624
623
|
# Actual scale will be calculated after image creation
|
625
624
|
elif "height" in to_image_args and to_image_args["height"] is not None:
|
626
625
|
logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
|
627
626
|
# Actual scale will be calculated after image creation
|
628
627
|
else:
|
629
628
|
# Use explicit resolution from kwargs if present, then the resolution param, then scale
|
630
|
-
render_resolution = to_image_args.pop(
|
629
|
+
render_resolution = to_image_args.pop(
|
630
|
+
"resolution", resolution
|
631
|
+
) # Use and remove from kwargs if present
|
631
632
|
if render_resolution is None:
|
632
633
|
render_resolution = scale * 72
|
633
|
-
to_image_args["resolution"] = render_resolution
|
634
|
+
to_image_args["resolution"] = render_resolution # Add it back for the call
|
634
635
|
actual_scale_x = render_resolution / 72.0
|
635
636
|
actual_scale_y = render_resolution / 72.0
|
636
|
-
logger.debug(
|
637
|
+
logger.debug(
|
638
|
+
f"Rendering page {page_index} with resolution {render_resolution} (scale: {actual_scale_x:.2f})."
|
639
|
+
)
|
637
640
|
|
638
641
|
try:
|
639
642
|
# base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
|
@@ -645,22 +648,23 @@ class HighlightingService:
|
|
645
648
|
)
|
646
649
|
if isinstance(base_image_pil, bytes):
|
647
650
|
from io import BytesIO
|
651
|
+
|
648
652
|
base_image_pil = Image.open(BytesIO(base_image_pil))
|
649
|
-
base_image_pil = base_image_pil.convert("RGBA")
|
650
|
-
logger.debug(
|
651
|
-
f"Base image for page {page_index} rendered. Size: {base_image_pil.size}."
|
652
|
-
)
|
653
|
+
base_image_pil = base_image_pil.convert("RGBA") # Ensure RGBA for renderer
|
654
|
+
logger.debug(f"Base image for page {page_index} rendered. Size: {base_image_pil.size}.")
|
653
655
|
|
654
|
-
if actual_scale_x is None or actual_scale_y is None:
|
656
|
+
if actual_scale_x is None or actual_scale_y is None: # If not set by resolution path
|
655
657
|
if page_obj.width > 0:
|
656
658
|
actual_scale_x = base_image_pil.width / page_obj.width
|
657
|
-
else:
|
658
|
-
actual_scale_x = scale
|
659
|
+
else:
|
660
|
+
actual_scale_x = scale # Fallback
|
659
661
|
if page_obj.height > 0:
|
660
662
|
actual_scale_y = base_image_pil.height / page_obj.height
|
661
663
|
else:
|
662
|
-
actual_scale_y = scale
|
663
|
-
logger.debug(
|
664
|
+
actual_scale_y = scale # Fallback
|
665
|
+
logger.debug(
|
666
|
+
f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}"
|
667
|
+
)
|
664
668
|
|
665
669
|
except IOError as e:
|
666
670
|
logger.error(f"IOError creating base image for page {page_index}: {e}")
|
@@ -668,8 +672,8 @@ class HighlightingService:
|
|
668
672
|
except AttributeError as e:
|
669
673
|
logger.error(f"AttributeError creating base image for page {page_index}: {e}")
|
670
674
|
raise
|
671
|
-
|
672
|
-
renderer_scale = actual_scale_x
|
675
|
+
|
676
|
+
renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
|
673
677
|
|
674
678
|
# --- Render Highlights ---
|
675
679
|
rendered_image: Image.Image
|
@@ -678,7 +682,7 @@ class HighlightingService:
|
|
678
682
|
page=page_obj,
|
679
683
|
base_image=base_image_pil,
|
680
684
|
highlights=highlights_on_page,
|
681
|
-
scale=renderer_scale,
|
685
|
+
scale=renderer_scale, # Use the determined actual scale
|
682
686
|
render_ocr=render_ocr,
|
683
687
|
)
|
684
688
|
rendered_image = renderer.render()
|
@@ -747,21 +751,25 @@ class HighlightingService:
|
|
747
751
|
return None
|
748
752
|
|
749
753
|
page_obj = self._pdf.pages[page_index]
|
750
|
-
|
754
|
+
|
751
755
|
to_image_args = kwargs.copy()
|
752
756
|
actual_scale_x = None
|
753
757
|
actual_scale_y = None
|
754
758
|
|
755
759
|
# Determine arguments for page._page.to_image()
|
756
760
|
if "width" in to_image_args and to_image_args["width"] is not None:
|
757
|
-
logger.debug(
|
761
|
+
logger.debug(
|
762
|
+
f"Rendering preview for page {page_index} with width={to_image_args['width']}."
|
763
|
+
)
|
758
764
|
# Resolution is implicitly handled by pdfplumber when width is set
|
759
|
-
if "height" in to_image_args:
|
765
|
+
if "height" in to_image_args:
|
760
766
|
to_image_args.pop("height", None)
|
761
767
|
# after image is created, we will calculate actual_scale_x and actual_scale_y
|
762
768
|
|
763
769
|
elif "height" in to_image_args and to_image_args["height"] is not None:
|
764
|
-
logger.debug(
|
770
|
+
logger.debug(
|
771
|
+
f"Rendering preview for page {page_index} with height={to_image_args['height']}."
|
772
|
+
)
|
765
773
|
# Resolution is implicitly handled by pdfplumber when height is set
|
766
774
|
# after image is created, we will calculate actual_scale_x and actual_scale_y
|
767
775
|
else:
|
@@ -770,7 +778,9 @@ class HighlightingService:
|
|
770
778
|
to_image_args["resolution"] = render_resolution
|
771
779
|
actual_scale_x = render_resolution / 72.0
|
772
780
|
actual_scale_y = render_resolution / 72.0
|
773
|
-
logger.debug(
|
781
|
+
logger.debug(
|
782
|
+
f"Rendering preview for page {page_index} with resolution={render_resolution} (scale: {actual_scale_x:.2f})."
|
783
|
+
)
|
774
784
|
|
775
785
|
try:
|
776
786
|
img_object = page_obj._page.to_image(**to_image_args)
|
@@ -781,6 +791,7 @@ class HighlightingService:
|
|
781
791
|
)
|
782
792
|
if isinstance(base_image_pil, bytes):
|
783
793
|
from io import BytesIO
|
794
|
+
|
784
795
|
base_image_pil = Image.open(BytesIO(base_image_pil))
|
785
796
|
base_image_pil = base_image_pil.convert("RGB")
|
786
797
|
|
@@ -789,12 +800,14 @@ class HighlightingService:
|
|
789
800
|
if page_obj.width > 0:
|
790
801
|
actual_scale_x = base_image_pil.width / page_obj.width
|
791
802
|
else:
|
792
|
-
actual_scale_x = scale
|
803
|
+
actual_scale_x = scale # Fallback to original scale
|
793
804
|
if page_obj.height > 0:
|
794
805
|
actual_scale_y = base_image_pil.height / page_obj.height
|
795
806
|
else:
|
796
|
-
actual_scale_y = scale
|
797
|
-
logger.debug(
|
807
|
+
actual_scale_y = scale # Fallback to original scale
|
808
|
+
logger.debug(
|
809
|
+
f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})"
|
810
|
+
)
|
798
811
|
|
799
812
|
# Convert temporary highlight dicts to Highlight objects
|
800
813
|
preview_highlights = []
|
@@ -828,16 +841,18 @@ class HighlightingService:
|
|
828
841
|
attributes=attrs_to_draw,
|
829
842
|
)
|
830
843
|
)
|
831
|
-
|
844
|
+
|
832
845
|
# Use the calculated actual_scale_x for the HighlightRenderer
|
833
846
|
# Assuming HighlightRenderer can handle a single scale or we adapt it.
|
834
847
|
# For now, pdfplumber usually maintains aspect ratio, so one scale should be okay.
|
835
848
|
# If not, HighlightRenderer needs to accept scale_x and scale_y.
|
836
|
-
# We will use actual_scale_x assuming aspect ratio is maintained by pdfplumber,
|
849
|
+
# We will use actual_scale_x assuming aspect ratio is maintained by pdfplumber,
|
837
850
|
# or if not, it's a reasonable approximation for highlight scaling.
|
838
|
-
renderer_scale = actual_scale_x
|
851
|
+
renderer_scale = actual_scale_x
|
839
852
|
|
840
|
-
renderer = HighlightRenderer(
|
853
|
+
renderer = HighlightRenderer(
|
854
|
+
page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr
|
855
|
+
)
|
841
856
|
rendered_image = renderer.render()
|
842
857
|
|
843
858
|
legend = None
|