natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (44):
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +119 -76
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/describe/__init__.py +21 -0
  14. natural_pdf/describe/base.py +457 -0
  15. natural_pdf/describe/elements.py +411 -0
  16. natural_pdf/describe/mixin.py +84 -0
  17. natural_pdf/describe/summary.py +186 -0
  18. natural_pdf/elements/base.py +11 -10
  19. natural_pdf/elements/collections.py +116 -51
  20. natural_pdf/elements/region.py +204 -127
  21. natural_pdf/exporters/paddleocr.py +38 -13
  22. natural_pdf/flows/__init__.py +3 -3
  23. natural_pdf/flows/collections.py +303 -132
  24. natural_pdf/flows/element.py +277 -132
  25. natural_pdf/flows/flow.py +33 -16
  26. natural_pdf/flows/region.py +142 -79
  27. natural_pdf/ocr/engine_doctr.py +37 -4
  28. natural_pdf/ocr/engine_easyocr.py +23 -3
  29. natural_pdf/ocr/engine_paddle.py +281 -30
  30. natural_pdf/ocr/engine_surya.py +8 -3
  31. natural_pdf/ocr/ocr_manager.py +75 -76
  32. natural_pdf/ocr/ocr_options.py +52 -87
  33. natural_pdf/search/__init__.py +25 -12
  34. natural_pdf/search/lancedb_search_service.py +91 -54
  35. natural_pdf/search/numpy_search_service.py +86 -65
  36. natural_pdf/search/searchable_mixin.py +2 -2
  37. natural_pdf/selectors/parser.py +125 -81
  38. natural_pdf/widgets/__init__.py +1 -1
  39. natural_pdf/widgets/viewer.py +205 -449
  40. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
  41. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
  42. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
  43. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
  44. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,11 @@
1
1
  import logging
2
2
  from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
3
3
 
4
- # Assuming PIL is installed as it's needed for vision
5
- try:
6
- from PIL import Image
7
- except ImportError:
8
- Image = None # type: ignore
4
+ from PIL import Image
9
5
 
10
- # Import result classes
11
- from .results import ClassificationResult # Assuming results.py is in the same dir
6
+ from .results import ClassificationResult
12
7
 
13
8
  if TYPE_CHECKING:
14
- # Avoid runtime import cycle
15
9
  from natural_pdf.core.page import Page
16
10
  from natural_pdf.elements.region import Region
17
11
 
@@ -61,14 +61,16 @@ except ImportError as e:
61
61
 
62
62
  SearchServiceProtocol, SearchOptions, Indexable = object, object, object
63
63
 
64
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
65
+
64
66
  # Import the ApplyMixin
65
67
  from natural_pdf.collections.mixins import ApplyMixin
66
68
  from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
67
69
 
68
- from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
69
70
 
70
-
71
- class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin): # Add ExportMixin and ShapeDetectionMixin
71
+ class PDFCollection(
72
+ SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin
73
+ ): # Add ExportMixin and ShapeDetectionMixin
72
74
  def __init__(
73
75
  self,
74
76
  source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -120,6 +122,7 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixi
120
122
  def _get_pdf_class():
121
123
  """Helper method to dynamically import the PDF class."""
122
124
  from natural_pdf.core.pdf import PDF
125
+
123
126
  return PDF
124
127
 
125
128
  # --- Internal Helpers ---
@@ -382,33 +385,25 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixi
382
385
  pdf_path = pdf.path # Get path for logging
383
386
  logger.debug(f"[{thread_id}] Starting OCR process for: {pdf_path}")
384
387
  start_time = time.monotonic()
385
- try:
386
- pdf.apply_ocr( # Call apply_ocr on the original PDF object
387
- pages=pages,
388
- engine=engine,
389
- languages=languages,
390
- min_confidence=min_confidence,
391
- device=device,
392
- resolution=resolution,
393
- apply_exclusions=apply_exclusions,
394
- detect_only=detect_only,
395
- replace=replace,
396
- options=options,
397
- # Note: We might want a max_workers here too for page rendering?
398
- # For now, PDF.apply_ocr doesn't have it.
399
- )
400
- end_time = time.monotonic()
401
- logger.debug(
402
- f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
403
- )
404
- return pdf_path, None
405
- except Exception as e:
406
- end_time = time.monotonic()
407
- logger.error(
408
- f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
409
- exc_info=False,
410
- )
411
- return pdf_path, e # Return path and error
388
+ pdf.apply_ocr( # Call apply_ocr on the original PDF object
389
+ pages=pages,
390
+ engine=engine,
391
+ languages=languages,
392
+ min_confidence=min_confidence,
393
+ device=device,
394
+ resolution=resolution,
395
+ apply_exclusions=apply_exclusions,
396
+ detect_only=detect_only,
397
+ replace=replace,
398
+ options=options,
399
+ # Note: We might want a max_workers here too for page rendering?
400
+ # For now, PDF.apply_ocr doesn't have it.
401
+ )
402
+ end_time = time.monotonic()
403
+ logger.debug(
404
+ f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
405
+ )
406
+ return pdf_path, None
412
407
 
413
408
  # Use ThreadPoolExecutor for parallel processing if max_workers > 1
414
409
  if max_workers is not None and max_workers > 1:
@@ -219,9 +219,7 @@ class HighlightRenderer:
219
219
  ocr_elements = self.page.find_all("text[source=ocr]")
220
220
  if not ocr_elements:
221
221
  # Don't run full OCR here, just extract if already run
222
- ocr_elements = [
223
- el for el in self.page.words if getattr(el, "source", None) == "ocr"
224
- ]
222
+ ocr_elements = [el for el in self.page.words if getattr(el, "source", None) == "ocr"]
225
223
  # Alternative: self.page.extract_ocr_elements() - but might be slow
226
224
 
227
225
  if not ocr_elements:
@@ -611,7 +609,7 @@ class HighlightingService:
611
609
  logger.error(f"Invalid page index {page_index} for rendering.")
612
610
  return None
613
611
 
614
- page_obj = self._pdf[page_index] # Renamed to avoid conflict
612
+ page_obj = self._pdf[page_index] # Renamed to avoid conflict
615
613
  highlights_on_page = self.get_highlights_for_page(page_index)
616
614
 
617
615
  to_image_args = kwargs.copy()
@@ -620,20 +618,25 @@ class HighlightingService:
620
618
 
621
619
  if "width" in to_image_args and to_image_args["width"] is not None:
622
620
  logger.debug(f"Rendering page {page_index} with width={to_image_args['width']}.")
623
- if "height" in to_image_args: to_image_args.pop("height", None)
621
+ if "height" in to_image_args:
622
+ to_image_args.pop("height", None)
624
623
  # Actual scale will be calculated after image creation
625
624
  elif "height" in to_image_args and to_image_args["height"] is not None:
626
625
  logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
627
626
  # Actual scale will be calculated after image creation
628
627
  else:
629
628
  # Use explicit resolution from kwargs if present, then the resolution param, then scale
630
- render_resolution = to_image_args.pop("resolution", resolution) # Use and remove from kwargs if present
629
+ render_resolution = to_image_args.pop(
630
+ "resolution", resolution
631
+ ) # Use and remove from kwargs if present
631
632
  if render_resolution is None:
632
633
  render_resolution = scale * 72
633
- to_image_args["resolution"] = render_resolution # Add it back for the call
634
+ to_image_args["resolution"] = render_resolution # Add it back for the call
634
635
  actual_scale_x = render_resolution / 72.0
635
636
  actual_scale_y = render_resolution / 72.0
636
- logger.debug(f"Rendering page {page_index} with resolution {render_resolution} (scale: {actual_scale_x:.2f}).")
637
+ logger.debug(
638
+ f"Rendering page {page_index} with resolution {render_resolution} (scale: {actual_scale_x:.2f})."
639
+ )
637
640
 
638
641
  try:
639
642
  # base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
@@ -645,22 +648,23 @@ class HighlightingService:
645
648
  )
646
649
  if isinstance(base_image_pil, bytes):
647
650
  from io import BytesIO
651
+
648
652
  base_image_pil = Image.open(BytesIO(base_image_pil))
649
- base_image_pil = base_image_pil.convert("RGBA") # Ensure RGBA for renderer
650
- logger.debug(
651
- f"Base image for page {page_index} rendered. Size: {base_image_pil.size}."
652
- )
653
+ base_image_pil = base_image_pil.convert("RGBA") # Ensure RGBA for renderer
654
+ logger.debug(f"Base image for page {page_index} rendered. Size: {base_image_pil.size}.")
653
655
 
654
- if actual_scale_x is None or actual_scale_y is None: # If not set by resolution path
656
+ if actual_scale_x is None or actual_scale_y is None: # If not set by resolution path
655
657
  if page_obj.width > 0:
656
658
  actual_scale_x = base_image_pil.width / page_obj.width
657
- else:
658
- actual_scale_x = scale # Fallback
659
+ else:
660
+ actual_scale_x = scale # Fallback
659
661
  if page_obj.height > 0:
660
662
  actual_scale_y = base_image_pil.height / page_obj.height
661
663
  else:
662
- actual_scale_y = scale # Fallback
663
- logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}")
664
+ actual_scale_y = scale # Fallback
665
+ logger.debug(
666
+ f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}"
667
+ )
664
668
 
665
669
  except IOError as e:
666
670
  logger.error(f"IOError creating base image for page {page_index}: {e}")
@@ -668,8 +672,8 @@ class HighlightingService:
668
672
  except AttributeError as e:
669
673
  logger.error(f"AttributeError creating base image for page {page_index}: {e}")
670
674
  raise
671
-
672
- renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
675
+
676
+ renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
673
677
 
674
678
  # --- Render Highlights ---
675
679
  rendered_image: Image.Image
@@ -678,7 +682,7 @@ class HighlightingService:
678
682
  page=page_obj,
679
683
  base_image=base_image_pil,
680
684
  highlights=highlights_on_page,
681
- scale=renderer_scale, # Use the determined actual scale
685
+ scale=renderer_scale, # Use the determined actual scale
682
686
  render_ocr=render_ocr,
683
687
  )
684
688
  rendered_image = renderer.render()
@@ -747,21 +751,25 @@ class HighlightingService:
747
751
  return None
748
752
 
749
753
  page_obj = self._pdf.pages[page_index]
750
-
754
+
751
755
  to_image_args = kwargs.copy()
752
756
  actual_scale_x = None
753
757
  actual_scale_y = None
754
758
 
755
759
  # Determine arguments for page._page.to_image()
756
760
  if "width" in to_image_args and to_image_args["width"] is not None:
757
- logger.debug(f"Rendering preview for page {page_index} with width={to_image_args['width']}.")
761
+ logger.debug(
762
+ f"Rendering preview for page {page_index} with width={to_image_args['width']}."
763
+ )
758
764
  # Resolution is implicitly handled by pdfplumber when width is set
759
- if "height" in to_image_args:
765
+ if "height" in to_image_args:
760
766
  to_image_args.pop("height", None)
761
767
  # after image is created, we will calculate actual_scale_x and actual_scale_y
762
768
 
763
769
  elif "height" in to_image_args and to_image_args["height"] is not None:
764
- logger.debug(f"Rendering preview for page {page_index} with height={to_image_args['height']}.")
770
+ logger.debug(
771
+ f"Rendering preview for page {page_index} with height={to_image_args['height']}."
772
+ )
765
773
  # Resolution is implicitly handled by pdfplumber when height is set
766
774
  # after image is created, we will calculate actual_scale_x and actual_scale_y
767
775
  else:
@@ -770,7 +778,9 @@ class HighlightingService:
770
778
  to_image_args["resolution"] = render_resolution
771
779
  actual_scale_x = render_resolution / 72.0
772
780
  actual_scale_y = render_resolution / 72.0
773
- logger.debug(f"Rendering preview for page {page_index} with resolution={render_resolution} (scale: {actual_scale_x:.2f}).")
781
+ logger.debug(
782
+ f"Rendering preview for page {page_index} with resolution={render_resolution} (scale: {actual_scale_x:.2f})."
783
+ )
774
784
 
775
785
  try:
776
786
  img_object = page_obj._page.to_image(**to_image_args)
@@ -781,6 +791,7 @@ class HighlightingService:
781
791
  )
782
792
  if isinstance(base_image_pil, bytes):
783
793
  from io import BytesIO
794
+
784
795
  base_image_pil = Image.open(BytesIO(base_image_pil))
785
796
  base_image_pil = base_image_pil.convert("RGB")
786
797
 
@@ -789,12 +800,14 @@ class HighlightingService:
789
800
  if page_obj.width > 0:
790
801
  actual_scale_x = base_image_pil.width / page_obj.width
791
802
  else:
792
- actual_scale_x = scale # Fallback to original scale
803
+ actual_scale_x = scale # Fallback to original scale
793
804
  if page_obj.height > 0:
794
805
  actual_scale_y = base_image_pil.height / page_obj.height
795
806
  else:
796
- actual_scale_y = scale # Fallback to original scale
797
- logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})")
807
+ actual_scale_y = scale # Fallback to original scale
808
+ logger.debug(
809
+ f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})"
810
+ )
798
811
 
799
812
  # Convert temporary highlight dicts to Highlight objects
800
813
  preview_highlights = []
@@ -828,16 +841,18 @@ class HighlightingService:
828
841
  attributes=attrs_to_draw,
829
842
  )
830
843
  )
831
-
844
+
832
845
  # Use the calculated actual_scale_x for the HighlightRenderer
833
846
  # Assuming HighlightRenderer can handle a single scale or we adapt it.
834
847
  # For now, pdfplumber usually maintains aspect ratio, so one scale should be okay.
835
848
  # If not, HighlightRenderer needs to accept scale_x and scale_y.
836
- # We will use actual_scale_x assuming aspect ratio is maintained by pdfplumber,
849
+ # We will use actual_scale_x assuming aspect ratio is maintained by pdfplumber,
837
850
  # or if not, it's a reasonable approximation for highlight scaling.
838
- renderer_scale = actual_scale_x
851
+ renderer_scale = actual_scale_x
839
852
 
840
- renderer = HighlightRenderer(page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr)
853
+ renderer = HighlightRenderer(
854
+ page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr
855
+ )
841
856
  rendered_image = renderer.render()
842
857
 
843
858
  legend = None