natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. natural_pdf/__init__.py +7 -2
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +4 -2
  8. natural_pdf/core/pdf.py +53 -38
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +203 -59
  11. natural_pdf/elements/region.py +43 -11
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +40 -61
  16. natural_pdf/exporters/hocr_font.py +7 -13
  17. natural_pdf/exporters/original_pdf.py +10 -13
  18. natural_pdf/exporters/searchable_pdf.py +0 -10
  19. natural_pdf/search/__init__.py +65 -52
  20. natural_pdf/search/lancedb_search_service.py +325 -0
  21. natural_pdf/search/numpy_search_service.py +255 -0
  22. natural_pdf/search/searchable_mixin.py +25 -71
  23. natural_pdf/widgets/viewer.py +22 -31
  24. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -49
  25. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +28 -25
  26. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  27. natural_pdf/search/haystack_search_service.py +0 -687
  28. natural_pdf/search/haystack_utils.py +0 -474
  29. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  30. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
natural_pdf/core/highlighting_service.py CHANGED
@@ -611,13 +611,13 @@ class HighlightingService:
611
611
 
612
612
  Args:
613
613
  page_index: The 0-based index of the page to render.
614
- scale: Scale factor for rendering highlights.
614
+ scale: Scale factor for rendering highlights if width/height/resolution not in kwargs.
615
615
  labels: Whether to include a legend for highlights.
616
616
  legend_position: Position of the legend.
617
617
  render_ocr: Whether to render OCR text on the image.
618
- resolution: Optional resolution (DPI) for the base page image.
619
- Defaults to scale * 72.
620
- kwargs: Additional keyword arguments for pdfplumber's page.to_image.
618
+ resolution: Optional resolution (DPI) for the base page image if width/height not in kwargs.
619
+ Defaults to scale * 72 if not otherwise specified.
620
+ kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
621
621
 
622
622
  Returns:
623
623
  A PIL Image object of the rendered page, or None if rendering fails.
@@ -626,34 +626,81 @@ class HighlightingService:
626
626
  logger.error(f"Invalid page index {page_index} for rendering.")
627
627
  return None
628
628
 
629
- page = self._pdf[page_index]
629
+ page_obj = self._pdf[page_index] # Renamed to avoid conflict
630
630
  highlights_on_page = self.get_highlights_for_page(page_index)
631
631
 
632
- render_resolution = resolution if resolution is not None else scale * 72
633
- base_image = render_plain_page(page, render_resolution)
634
- base_image = base_image.convert("RGBA")
635
- logger.debug(
636
- f"Base image for page {page_index} rendered with resolution {render_resolution}."
637
- )
632
+ to_image_args = kwargs.copy()
633
+ actual_scale_x = None
634
+ actual_scale_y = None
635
+
636
+ if "width" in to_image_args and to_image_args["width"] is not None:
637
+ logger.debug(f"Rendering page {page_index} with width={to_image_args['width']}.")
638
+ if "height" in to_image_args: to_image_args.pop("height", None)
639
+ # Actual scale will be calculated after image creation
640
+ elif "height" in to_image_args and to_image_args["height"] is not None:
641
+ logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
642
+ # Actual scale will be calculated after image creation
643
+ else:
644
+ # Use explicit resolution from kwargs if present, then the resolution param, then scale
645
+ render_resolution = to_image_args.pop("resolution", resolution) # Use and remove from kwargs if present
646
+ if render_resolution is None:
647
+ render_resolution = scale * 72
648
+ to_image_args["resolution"] = render_resolution # Add it back for the call
649
+ actual_scale_x = render_resolution / 72.0
650
+ actual_scale_y = render_resolution / 72.0
651
+ logger.debug(f"Rendering page {page_index} with resolution {render_resolution} (scale: {actual_scale_x:.2f}).")
652
+
653
+ try:
654
+ # base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
655
+ img_object = page_obj._page.to_image(**to_image_args)
656
+ base_image_pil = (
657
+ img_object.annotated
658
+ if hasattr(img_object, "annotated")
659
+ else img_object._repr_png_()
660
+ )
661
+ if isinstance(base_image_pil, bytes):
662
+ from io import BytesIO
663
+ base_image_pil = Image.open(BytesIO(base_image_pil))
664
+ base_image_pil = base_image_pil.convert("RGBA") # Ensure RGBA for renderer
665
+ logger.debug(
666
+ f"Base image for page {page_index} rendered. Size: {base_image_pil.size}."
667
+ )
668
+
669
+ if actual_scale_x is None or actual_scale_y is None: # If not set by resolution path
670
+ if page_obj.width > 0:
671
+ actual_scale_x = base_image_pil.width / page_obj.width
672
+ else:
673
+ actual_scale_x = scale # Fallback
674
+ if page_obj.height > 0:
675
+ actual_scale_y = base_image_pil.height / page_obj.height
676
+ else:
677
+ actual_scale_y = scale # Fallback
678
+ logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}")
679
+
680
+ except Exception as e:
681
+ logger.error(f"Error creating base image for page {page_index}: {e}", exc_info=True)
682
+ return None
683
+
684
+ renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
638
685
 
639
686
  # --- Render Highlights ---
640
687
  rendered_image: Image.Image
641
688
  if highlights_on_page:
642
689
  renderer = HighlightRenderer(
643
- page=page,
644
- base_image=base_image,
690
+ page=page_obj,
691
+ base_image=base_image_pil,
645
692
  highlights=highlights_on_page,
646
- scale=scale,
693
+ scale=renderer_scale, # Use the determined actual scale
647
694
  render_ocr=render_ocr,
648
695
  )
649
696
  rendered_image = renderer.render()
650
697
  else:
651
698
  if render_ocr:
652
- # Still render OCR even if no highlights
653
- renderer = HighlightRenderer(page, base_image, [], scale, True)
699
+ # Still render OCR even if no highlights, using the determined actual scale
700
+ renderer = HighlightRenderer(page_obj, base_image_pil, [], renderer_scale, True)
654
701
  rendered_image = renderer.render()
655
702
  else:
656
- rendered_image = base_image # No highlights, no OCR requested
703
+ rendered_image = base_image_pil # No highlights, no OCR requested
657
704
 
658
705
  # --- Add Legend (Based ONLY on this page's highlights) ---
659
706
  if labels:
@@ -697,12 +744,12 @@ class HighlightingService:
697
744
  Args:
698
745
  page_index: Index of the page to render.
699
746
  temporary_highlights: List of highlight data dicts (from ElementCollection._prepare).
700
- scale: Scale factor for rendering.
747
+ scale: Original scale factor for rendering, used if width/height are not provided.
701
748
  labels: Whether to include a legend.
702
749
  legend_position: Position of the legend.
703
750
  render_ocr: Whether to render OCR text.
704
- resolution: Resolution for base page image rendering.
705
- **kwargs: Additional args for pdfplumber's to_image.
751
+ resolution: Resolution for base page image rendering if width/height not used.
752
+ **kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
706
753
 
707
754
  Returns:
708
755
  PIL Image of the preview, or None if rendering fails.
@@ -711,35 +758,64 @@ class HighlightingService:
711
758
  logger.error(f"Invalid page index {page_index} for render_preview.")
712
759
  return None
713
760
 
714
- page = self._pdf.pages[page_index]
715
- render_resolution = resolution if resolution is not None else scale * 72
761
+ page_obj = self._pdf.pages[page_index]
762
+
763
+ to_image_args = kwargs.copy()
764
+ actual_scale_x = None
765
+ actual_scale_y = None
766
+
767
+ # Determine arguments for page._page.to_image()
768
+ if "width" in to_image_args and to_image_args["width"] is not None:
769
+ logger.debug(f"Rendering preview for page {page_index} with width={to_image_args['width']}.")
770
+ # Resolution is implicitly handled by pdfplumber when width is set
771
+ if "height" in to_image_args:
772
+ to_image_args.pop("height", None)
773
+ # after image is created, we will calculate actual_scale_x and actual_scale_y
774
+
775
+ elif "height" in to_image_args and to_image_args["height"] is not None:
776
+ logger.debug(f"Rendering preview for page {page_index} with height={to_image_args['height']}.")
777
+ # Resolution is implicitly handled by pdfplumber when height is set
778
+ # after image is created, we will calculate actual_scale_x and actual_scale_y
779
+ else:
780
+ # Neither width nor height is provided, use resolution or scale.
781
+ render_resolution = resolution if resolution is not None else scale * 72
782
+ to_image_args["resolution"] = render_resolution
783
+ actual_scale_x = render_resolution / 72.0
784
+ actual_scale_y = render_resolution / 72.0
785
+ logger.debug(f"Rendering preview for page {page_index} with resolution={render_resolution} (scale: {actual_scale_x:.2f}).")
716
786
 
717
787
  try:
718
- # Get base image from pdfplumber using the Page object's underlying _page
719
- img_object = page._page.to_image(resolution=render_resolution, **kwargs)
720
- base_image = (
788
+ img_object = page_obj._page.to_image(**to_image_args)
789
+ base_image_pil = (
721
790
  img_object.annotated
722
791
  if hasattr(img_object, "annotated")
723
792
  else img_object._repr_png_()
724
793
  )
725
- if isinstance(base_image, bytes):
794
+ if isinstance(base_image_pil, bytes):
726
795
  from io import BytesIO
796
+ base_image_pil = Image.open(BytesIO(base_image_pil))
797
+ base_image_pil = base_image_pil.convert("RGB")
727
798
 
728
- base_image = Image.open(BytesIO(base_image))
729
- base_image = base_image.convert("RGB") # Ensure consistent format
799
+ # If scale was not determined by resolution, calculate it now from base_image_pil dimensions
800
+ if actual_scale_x is None or actual_scale_y is None:
801
+ if page_obj.width > 0:
802
+ actual_scale_x = base_image_pil.width / page_obj.width
803
+ else:
804
+ actual_scale_x = scale # Fallback to original scale
805
+ if page_obj.height > 0:
806
+ actual_scale_y = base_image_pil.height / page_obj.height
807
+ else:
808
+ actual_scale_y = scale # Fallback to original scale
809
+ logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})")
730
810
 
731
811
  # Convert temporary highlight dicts to Highlight objects
732
- # Note: Colors/labels should be determined *here* for temporary preview
733
812
  preview_highlights = []
734
813
  for hl_data in temporary_highlights:
735
- # Determine the final color using the service logic
736
814
  final_color = self._determine_highlight_color(
737
815
  color_input=hl_data.get("color"),
738
816
  label=hl_data.get("label"),
739
817
  use_color_cycling=hl_data.get("use_color_cycling", False),
740
818
  )
741
-
742
- # Extract potential attributes to draw
743
819
  attrs_to_draw = {}
744
820
  element = hl_data.get("element")
745
821
  include_attrs = hl_data.get("include_attrs")
@@ -753,25 +829,29 @@ class HighlightingService:
753
829
  logger.warning(
754
830
  f"Attribute '{attr_name}' not found on element {element}"
755
831
  )
756
-
757
- # Add highlight if geometry exists
758
832
  if hl_data.get("bbox") or hl_data.get("polygon"):
759
833
  preview_highlights.append(
760
834
  Highlight(
761
835
  page_index=hl_data["page_index"],
762
836
  bbox=hl_data.get("bbox"),
763
837
  polygon=hl_data.get("polygon"),
764
- color=final_color, # Use the determined color
838
+ color=final_color,
765
839
  label=hl_data.get("label"),
766
840
  attributes=attrs_to_draw,
767
841
  )
768
842
  )
769
-
770
- # Render only these highlights
771
- renderer = HighlightRenderer(page, base_image, preview_highlights, scale, render_ocr)
843
+
844
+ # Use the calculated actual_scale_x for the HighlightRenderer
845
+ # Assuming HighlightRenderer can handle a single scale or we adapt it.
846
+ # For now, pdfplumber usually maintains aspect ratio, so one scale should be okay.
847
+ # If not, HighlightRenderer needs to accept scale_x and scale_y.
848
+ # We will use actual_scale_x assuming aspect ratio is maintained by pdfplumber,
849
+ # or if not, it's a reasonable approximation for highlight scaling.
850
+ renderer_scale = actual_scale_x
851
+
852
+ renderer = HighlightRenderer(page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr)
772
853
  rendered_image = renderer.render()
773
854
 
774
- # Create legend only from temporary highlights
775
855
  legend = None
776
856
  if labels:
777
857
  preview_labels = {h.label: h.color for h in preview_highlights if h.label}
@@ -781,7 +861,7 @@ class HighlightingService:
781
861
  rendered_image, legend, position=legend_position
782
862
  )
783
863
  else:
784
- final_image = rendered_image # No legend needed
864
+ final_image = rendered_image
785
865
  else:
786
866
  final_image = rendered_image
787
867
 
natural_pdf/core/page.py CHANGED
@@ -1349,7 +1349,9 @@ class Page(ClassificationMixin, ExtractionMixin):
1349
1349
  self._highlighter.clear_page(self.index)
1350
1350
  return self
1351
1351
 
1352
- def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> "ElementCollection":
1352
+ def analyze_text_styles(
1353
+ self, options: Optional[TextStyleOptions] = None
1354
+ ) -> "ElementCollection":
1353
1355
  """
1354
1356
  Analyze text elements by style, adding attributes directly to elements.
1355
1357
 
@@ -2130,7 +2132,7 @@ class Page(ClassificationMixin, ExtractionMixin):
2130
2132
  if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
2131
2133
  logger.error(
2132
2134
  "Interactive viewer requires optional dependencies ('ipywidgets'). "
2133
- "Install with `pip install natural-pdf[interactive]`"
2135
+ "Install with `pip install natural-pdf[viewer]`"
2134
2136
  )
2135
2137
  # raise ImportError("ipywidgets not found.") # Option 1: Raise error
2136
2138
  return None # Option 2: Return None gracefully
natural_pdf/core/pdf.py CHANGED
@@ -60,6 +60,7 @@ except ImportError:
60
60
  "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
61
61
  )
62
62
 
63
+
63
64
  try:
64
65
  from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
65
66
  except ImportError:
@@ -791,10 +792,10 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
791
792
  "PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
792
793
  )
793
794
  if create_searchable_pdf is None:
794
- raise ImportError(
795
- "Saving searchable PDF requires 'pikepdf' and 'Pillow'. "
796
- "Install with: pip install \"natural-pdf[ocr-export]\""
797
- )
795
+ raise ImportError(
796
+ "Saving searchable PDF requires 'pikepdf'. "
797
+ 'Install with: pip install "natural-pdf[ocr-export]"'
798
+ )
798
799
  output_path_str = str(output_path)
799
800
  # Call the exporter directly, passing self (the PDF instance)
800
801
  create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
@@ -842,55 +843,59 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
842
843
  output_path_str = str(output_path_obj)
843
844
 
844
845
  if ocr:
845
- if create_searchable_pdf is None:
846
- raise ImportError(
847
- "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
848
- "Install with: pip install \"natural-pdf[ocr-export]\""
849
- )
850
-
851
- # Optional: Add warning about vector data loss similar to PageCollection
852
846
  has_vector_elements = False
853
847
  for page in self.pages:
854
- if (hasattr(page, 'rects') and page.rects or
855
- hasattr(page, 'lines') and page.lines or
856
- hasattr(page, 'curves') and page.curves or
857
- (hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
858
- (hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
848
+ if (
849
+ hasattr(page, "rects")
850
+ and page.rects
851
+ or hasattr(page, "lines")
852
+ and page.lines
853
+ or hasattr(page, "curves")
854
+ and page.curves
855
+ or (
856
+ hasattr(page, "chars")
857
+ and any(getattr(el, "source", None) != "ocr" for el in page.chars)
858
+ )
859
+ or (
860
+ hasattr(page, "words")
861
+ and any(getattr(el, "source", None) != "ocr" for el in page.words)
862
+ )
863
+ ):
859
864
  has_vector_elements = True
860
865
  break
861
866
  if has_vector_elements:
862
- logger.warning(
863
- "Warning: Saving with ocr=True creates an image-based PDF. "
864
- "Original vector elements (rects, lines, non-OCR text/chars) "
865
- "will not be preserved in the output file."
866
- )
867
+ logger.warning(
868
+ "Warning: Saving with ocr=True creates an image-based PDF. "
869
+ "Original vector elements (rects, lines, non-OCR text/chars) "
870
+ "will not be preserved in the output file."
871
+ )
867
872
 
868
873
  logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
869
874
  try:
870
875
  # Delegate to the searchable PDF exporter, passing self (PDF instance)
871
876
  create_searchable_pdf(self, output_path_str, dpi=dpi)
872
877
  except Exception as e:
873
- raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
878
+ raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
874
879
 
875
880
  elif original:
876
881
  if create_original_pdf is None:
877
882
  raise ImportError(
878
883
  "Saving with original=True requires 'pikepdf'. "
879
- "Install with: pip install \"natural-pdf[ocr-export]\""
884
+ 'Install with: pip install "natural-pdf[ocr-export]"'
880
885
  )
881
886
 
882
- # Optional: Add warning about losing OCR data similar to PageCollection
887
+ # Optional: Add warning about losing OCR data similar to PageCollection
883
888
  has_ocr_elements = False
884
889
  for page in self.pages:
885
- if hasattr(page, 'find_all'):
886
- ocr_text_elements = page.find_all("text[source=ocr]")
887
- if ocr_text_elements:
888
- has_ocr_elements = True
889
- break
890
- elif hasattr(page, 'words'): # Fallback
891
- if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
892
- has_ocr_elements = True
893
- break
890
+ if hasattr(page, "find_all"):
891
+ ocr_text_elements = page.find_all("text[source=ocr]")
892
+ if ocr_text_elements:
893
+ has_ocr_elements = True
894
+ break
895
+ elif hasattr(page, "words"): # Fallback
896
+ if any(getattr(el, "source", None) == "ocr" for el in page.words):
897
+ has_ocr_elements = True
898
+ break
894
899
  if has_ocr_elements:
895
900
  logger.warning(
896
901
  "Warning: Saving with original=True preserves original page content. "
@@ -899,11 +904,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
899
904
 
900
905
  logger.info(f"Saving original PDF content to: {output_path_str}")
901
906
  try:
902
- # Delegate to the original PDF exporter, passing self (PDF instance)
903
- create_original_pdf(self, output_path_str)
907
+ # Delegate to the original PDF exporter, passing self (PDF instance)
908
+ create_original_pdf(self, output_path_str)
904
909
  except Exception as e:
905
- # Re-raise exception from exporter
906
- raise e
910
+ # Re-raise exception from exporter
911
+ raise e
907
912
 
908
913
  def ask(
909
914
  self,
@@ -1227,6 +1232,16 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1227
1232
  """Context manager exit."""
1228
1233
  self.close()
1229
1234
 
1235
+ def __repr__(self) -> str:
1236
+ """Return a string representation of the PDF object."""
1237
+ if not hasattr(self, "_pages"):
1238
+ page_count_str = "uninitialized"
1239
+ else:
1240
+ page_count_str = str(len(self._pages))
1241
+
1242
+ source_info = getattr(self, "source_path", "unknown source")
1243
+ return f"<PDF source='{source_info}' pages={page_count_str}>"
1244
+
1230
1245
  def get_id(self) -> str:
1231
1246
  """Get unique identifier for this PDF."""
1232
1247
  """Get unique identifier for this PDF."""
@@ -1400,7 +1415,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1400
1415
  except ImportError:
1401
1416
  raise ImportError(
1402
1417
  "Classification dependencies missing. "
1403
- 'Install with: pip install "natural-pdf[classification]"'
1418
+ 'Install with: pip install "natural-pdf[core-ml]"'
1404
1419
  )
1405
1420
  raise ClassificationError("ClassificationManager not available.")
1406
1421
 
natural_pdf/elements/base.py CHANGED
@@ -814,6 +814,7 @@ class Element(DirectionalMixin):
814
814
  legend_position: str = "right",
815
815
  color: Optional[Union[Tuple, str]] = "red", # Default color for single element
816
816
  label: Optional[str] = None,
817
+ width: Optional[int] = None, # Add width parameter
817
818
  ) -> Optional["Image.Image"]:
818
819
  """
819
820
  Show the page with only this element highlighted temporarily.
@@ -824,6 +825,7 @@ class Element(DirectionalMixin):
824
825
  legend_position: Position of the legend
825
826
  color: Color to highlight this element (default: red)
826
827
  label: Optional label for this element in the legend
828
+ width: Optional width for the output image in pixels
827
829
 
828
830
  Returns:
829
831
  PIL Image of the page with only this element highlighted, or None if error.
@@ -861,6 +863,7 @@ class Element(DirectionalMixin):
861
863
  page_index=self.page.index,
862
864
  temporary_highlights=[temp_highlight_data],
863
865
  scale=scale,
866
+ width=width, # Pass the width parameter
864
867
  labels=labels,
865
868
  legend_position=legend_position,
866
869
  )
@@ -898,6 +901,7 @@ class Element(DirectionalMixin):
898
901
  self,
899
902
  *,
900
903
  text: str,
904
+ contains: str = "all",
901
905
  apply_exclusions: bool = True,
902
906
  regex: bool = False,
903
907
  case: bool = True,
@@ -909,6 +913,7 @@ class Element(DirectionalMixin):
909
913
  self,
910
914
  selector: str,
911
915
  *,
916
+ contains: str = "all",
912
917
  apply_exclusions: bool = True,
913
918
  regex: bool = False,
914
919
  case: bool = True,
@@ -920,6 +925,7 @@ class Element(DirectionalMixin):
920
925
  selector: Optional[str] = None,
921
926
  *,
922
927
  text: Optional[str] = None,
928
+ contains: str = "all",
923
929
  apply_exclusions: bool = True,
924
930
  regex: bool = False,
925
931
  case: bool = True,
@@ -934,6 +940,9 @@ class Element(DirectionalMixin):
934
940
  Args:
935
941
  selector: CSS-like selector string.
936
942
  text: Text content to search for (equivalent to 'text:contains(...)').
943
+ contains: How to determine if elements are inside: 'all' (fully inside),
944
+ 'any' (any overlap), or 'center' (center point inside).
945
+ (default: "all")
937
946
  apply_exclusions: Whether to apply exclusion regions (default: True).
938
947
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
939
948
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -950,6 +959,7 @@ class Element(DirectionalMixin):
950
959
  return temp_region.find(
951
960
  selector=selector,
952
961
  text=text,
962
+ contains=contains,
953
963
  apply_exclusions=apply_exclusions,
954
964
  regex=regex,
955
965
  case=case,
@@ -961,6 +971,7 @@ class Element(DirectionalMixin):
961
971
  self,
962
972
  *,
963
973
  text: str,
974
+ contains: str = "all",
964
975
  apply_exclusions: bool = True,
965
976
  regex: bool = False,
966
977
  case: bool = True,
@@ -972,6 +983,7 @@ class Element(DirectionalMixin):
972
983
  self,
973
984
  selector: str,
974
985
  *,
986
+ contains: str = "all",
975
987
  apply_exclusions: bool = True,
976
988
  regex: bool = False,
977
989
  case: bool = True,
@@ -983,6 +995,7 @@ class Element(DirectionalMixin):
983
995
  selector: Optional[str] = None,
984
996
  *,
985
997
  text: Optional[str] = None,
998
+ contains: str = "all",
986
999
  apply_exclusions: bool = True,
987
1000
  regex: bool = False,
988
1001
  case: bool = True,
@@ -997,6 +1010,9 @@ class Element(DirectionalMixin):
997
1010
  Args:
998
1011
  selector: CSS-like selector string.
999
1012
  text: Text content to search for (equivalent to 'text:contains(...)').
1013
+ contains: How to determine if elements are inside: 'all' (fully inside),
1014
+ 'any' (any overlap), or 'center' (center point inside).
1015
+ (default: "all")
1000
1016
  apply_exclusions: Whether to apply exclusion regions (default: True).
1001
1017
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1002
1018
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1013,6 +1029,7 @@ class Element(DirectionalMixin):
1013
1029
  return temp_region.find_all(
1014
1030
  selector=selector,
1015
1031
  text=text,
1032
+ contains=contains,
1016
1033
  apply_exclusions=apply_exclusions,
1017
1034
  regex=regex,
1018
1035
  case=case,