PyPI - natural-pdf - Versions diffs - 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl - Mend

natural-pdf 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

natural_pdf/__init__.py +8 -0
natural_pdf/analyzers/checkbox/__init__.py +6 -0
natural_pdf/analyzers/checkbox/base.py +265 -0
natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
natural_pdf/analyzers/checkbox/mixin.py +95 -0
natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
natural_pdf/analyzers/guides.py +26 -2
natural_pdf/collections/mixins.py +14 -5
natural_pdf/core/element_manager.py +5 -1
natural_pdf/core/page.py +61 -0
natural_pdf/core/page_collection.py +41 -1
natural_pdf/core/pdf.py +24 -1
natural_pdf/describe/base.py +20 -0
natural_pdf/elements/base.py +152 -10
natural_pdf/elements/element_collection.py +41 -2
natural_pdf/elements/region.py +115 -2
natural_pdf/judge.py +1509 -0
natural_pdf/selectors/parser.py +42 -1
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/METADATA +1 -1
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/RECORD +42 -18
temp/check_model.py +49 -0
temp/check_pdf_content.py +9 -0
temp/checkbox_checks.py +590 -0
temp/checkbox_simple.py +117 -0
temp/checkbox_ux_ideas.py +400 -0
temp/context_manager_prototype.py +177 -0
temp/convert_to_hf.py +60 -0
temp/demo_text_closest.py +66 -0
temp/inspect_model.py +43 -0
temp/rtdetr_dinov2_test.py +49 -0
temp/test_closest_debug.py +26 -0
temp/test_closest_debug2.py +22 -0
temp/test_context_exploration.py +85 -0
temp/test_durham.py +30 -0
temp/test_empty_string.py +16 -0
temp/test_similarity.py +15 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/WHEEL +0 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/top_level.txt +0 -0

natural_pdf/core/page.py CHANGED Viewed

@@ -50,6 +50,7 @@ import numpy as np
 from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
 from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
 from natural_pdf.analyzers.layout.layout_manager import LayoutManager
 from natural_pdf.analyzers.layout.layout_options import LayoutOptions
@@ -103,6 +104,7 @@ class Page(
     ClassificationMixin,
     ExtractionMixin,
     ShapeDetectionMixin,
+    CheckboxDetectionMixin,
     DescribeMixin,
     VisualSearchMixin,
     Visualizable,
@@ -1491,6 +1493,65 @@ class Page(
                     "Cannot sort elements in reading order: Missing required attributes (top, x0)."
                 )
+        # Handle :closest pseudo-class for fuzzy text matching
+        for pseudo in selector_obj.get("pseudo_classes", []):
+            name = pseudo.get("name")
+            if name == "closest" and pseudo.get("args") is not None:
+                import difflib
+                # Parse search text and threshold
+                search_text = str(pseudo["args"]).strip()
+                threshold = 0.0  # Default threshold
+                # Handle empty search text
+                if not search_text:
+                    matching_elements = []
+                    break
+                # Check if threshold is specified with @ separator
+                if "@" in search_text and search_text.count("@") == 1:
+                    text_part, threshold_part = search_text.rsplit("@", 1)
+                    try:
+                        threshold = float(threshold_part)
+                        search_text = text_part.strip()
+                    except (ValueError, TypeError):
+                        pass  # Keep original search_text and default threshold
+                # Determine case sensitivity
+                ignore_case = not kwargs.get("case", False)
+                # Calculate similarity scores for all elements
+                scored_elements = []
+                for el in matching_elements:
+                    if hasattr(el, "text") and el.text:
+                        el_text = el.text.strip()
+                        search_term = search_text
+                        if ignore_case:
+                            el_text = el_text.lower()
+                            search_term = search_term.lower()
+                        # Calculate similarity ratio
+                        ratio = difflib.SequenceMatcher(None, search_term, el_text).ratio()
+                        # Check if element contains the search term as substring
+                        contains_match = search_term in el_text
+                        # Store element with its similarity score and contains flag
+                        if ratio >= threshold:
+                            scored_elements.append((ratio, contains_match, el))
+                # Sort by:
+                # 1. Contains match (True before False)
+                # 2. Similarity score (highest first)
+                # This ensures substring matches come first but are sorted by similarity
+                scored_elements.sort(key=lambda x: (x[1], x[0]), reverse=True)
+                # Extract just the elements
+                matching_elements = [el for _, _, el in scored_elements]
+                break  # Only process the first :closest pseudo-class
         # Handle collection-level pseudo-classes (:first, :last)
         for pseudo in selector_obj.get("pseudo_classes", []):
             name = pseudo.get("name")

natural_pdf/core/page_collection.py CHANGED Viewed

@@ -28,6 +28,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
 from PIL import Image, ImageDraw, ImageFont
 from tqdm.auto import tqdm
+from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
 from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
 from natural_pdf.classification.manager import ClassificationManager
 from natural_pdf.classification.mixin import ClassificationMixin
@@ -76,7 +77,9 @@ T = TypeVar("T")
 P = TypeVar("P", bound="Page")
-class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Visualizable):
+class PageCollection(
+    TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, CheckboxDetectionMixin, Visualizable
+):
     """
     Represents a collection of Page objects, often from a single PDF document.
     Provides methods for batch operations on these pages.
@@ -1506,6 +1509,43 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         return ElementCollection(all_regions)
+    def detect_checkboxes(self, *args, **kwargs) -> "ElementCollection[Region]":
+        """
+        Detects checkboxes on each page in the collection.
+        This method iterates through each page, calls its detect_checkboxes method,
+        and returns a single ElementCollection containing all detected checkbox
+        regions from all pages.
+        Args:
+            *args: Positional arguments to pass to each page's detect_checkboxes method.
+            **kwargs: Keyword arguments to pass to each page's detect_checkboxes method.
+                      A 'show_progress' kwarg can be included to show a progress bar.
+        Returns:
+            An ElementCollection of all detected checkbox Region objects.
+        """
+        all_checkboxes = []
+        show_progress = kwargs.pop("show_progress", True)
+        iterator = self.pages
+        if show_progress:
+            try:
+                from tqdm.auto import tqdm
+                iterator = tqdm(self.pages, desc="Detecting checkboxes")
+            except ImportError:
+                pass  # tqdm not installed
+        for page in iterator:
+            # Each page's detect_checkboxes method returns an ElementCollection
+            checkbox_collection = page.detect_checkboxes(*args, **kwargs)
+            if checkbox_collection:
+                all_checkboxes.extend(checkbox_collection.elements)
+        return ElementCollection(all_checkboxes)
     def highlights(self, show: bool = False) -> "HighlightContext":
         """
         Create a highlight context for accumulating highlights.

natural_pdf/core/pdf.py CHANGED Viewed

@@ -27,6 +27,7 @@ from typing import (
 import pdfplumber
 from tqdm.auto import tqdm
+from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
 from natural_pdf.analyzers.layout.layout_manager import LayoutManager
 from natural_pdf.classification.manager import ClassificationError
 from natural_pdf.classification.mixin import ClassificationMixin
@@ -303,7 +304,13 @@ class _LazyPageList(Sequence):
 class PDF(
-    TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, VisualSearchMixin, Visualizable
+    TextMixin,
+    ExtractionMixin,
+    ExportMixin,
+    ClassificationMixin,
+    CheckboxDetectionMixin,
+    VisualSearchMixin,
+    Visualizable,
 ):
     """Enhanced PDF wrapper built on top of pdfplumber.
@@ -2552,6 +2559,22 @@ class PDF(
         """
         return self.pages.analyze_layout(*args, **kwargs)
+    def detect_checkboxes(self, *args, **kwargs) -> "ElementCollection[Region]":
+        """
+        Detects checkboxes on all pages in the PDF.
+        This is a convenience method that calls detect_checkboxes on the PDF's
+        page collection.
+        Args:
+            *args: Positional arguments passed to pages.detect_checkboxes().
+            **kwargs: Keyword arguments passed to pages.detect_checkboxes().
+        Returns:
+            An ElementCollection of all detected checkbox Region objects.
+        """
+        return self.pages.detect_checkboxes(*args, **kwargs)
     def highlights(self, show: bool = False) -> "HighlightContext":
         """
         Create a highlight context for accumulating highlights.

natural_pdf/describe/base.py CHANGED Viewed

@@ -233,6 +233,17 @@ def inspect_collection(collection: "ElementCollection", limit: int = 30) -> Insp
         # Get appropriate columns for this type
         columns = _get_columns_for_type(element_type, show_page_column)
+        # Add checkbox state column if we have checkbox regions
+        if element_type == "region" and any(
+            getattr(e, "region_type", "") == "checkbox" for e in display_elements
+        ):
+            # Insert state column after type column
+            if "type" in columns:
+                type_idx = columns.index("type")
+                columns.insert(type_idx + 1, "state")
+            else:
+                columns.append("state")
         # Extract data for each element
         element_data = []
         for element in display_elements:
@@ -423,6 +434,15 @@ def _extract_element_value(element: "Element", column: str) -> Any:
             value = getattr(element, column, False)
             return value if isinstance(value, bool) else False
+        elif column == "state":
+            # For checkbox regions, show checked/unchecked state
+            if getattr(element, "region_type", "") == "checkbox":
+                if hasattr(element, "is_checked"):
+                    return "checked" if element.is_checked else "unchecked"
+                elif hasattr(element, "checkbox_state"):
+                    return element.checkbox_state
+            return ""
         else:
             # Generic attribute access
             value = getattr(element, column, "")

natural_pdf/elements/base.py CHANGED Viewed

@@ -122,6 +122,8 @@ class DirectionalMixin:
         offset: float = 0.0,
         apply_exclusions: bool = True,
         multipage: bool = False,
+        within: Optional["Region"] = None,
+        anchor: str = "start",
         **kwargs,
     ) -> Union["Region", "FlowRegion"]:
         """
@@ -136,6 +138,9 @@ class DirectionalMixin:
             include_endpoint: Whether to include the boundary element found by 'until'
             offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
             apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
+            multipage: If True, allows the region to span multiple pages
+            within: Optional region to constrain the result to (default: None)
+            anchor: Reference point - 'start', 'center', 'end', or explicit edges like 'top', 'bottom', 'left', 'right'
             **kwargs: Additional parameters for the 'until' selector search
         Returns:
@@ -147,6 +152,37 @@ class DirectionalMixin:
         is_positive = direction in ("right", "below")  # right/below are positive directions
         pixel_offset = offset  # Use provided offset for excluding elements/endpoints
+        # Normalize anchor parameter
+        def normalize_anchor(anchor_value: str, dir: str) -> str:
+            """Convert start/end/center to explicit edges based on direction."""
+            if anchor_value == "center":
+                return "center"
+            elif anchor_value == "start":
+                # Start means the edge we're moving away from
+                if dir == "below":
+                    return "top"
+                elif dir == "above":
+                    return "bottom"
+                elif dir == "right":
+                    return "left"
+                elif dir == "left":
+                    return "right"
+            elif anchor_value == "end":
+                # End means the edge we're moving towards
+                if dir == "below":
+                    return "bottom"
+                elif dir == "above":
+                    return "top"
+                elif dir == "right":
+                    return "right"
+                elif dir == "left":
+                    return "left"
+            else:
+                # Already explicit (top/bottom/left/right)
+                return anchor_value
+        normalized_anchor = normalize_anchor(anchor, direction)
         # 1. Determine initial boundaries based on direction and include_source
         if is_horizontal:
             # Initial cross-boundaries (vertical)
@@ -200,34 +236,84 @@ class DirectionalMixin:
         if until:
             from natural_pdf.elements.element_collection import ElementCollection
+            # Get constraint region (from parameter or global options)
+            constraint_region = within or natural_pdf.options.layout.directional_within
+            # Check if until uses :closest selector (preserve ordering)
+            preserve_order = isinstance(until, str) and ":closest" in until
             # If until is an elementcollection, just use it
             if isinstance(until, ElementCollection):
                 # Only take ones on the same page
                 all_matches = [m for m in until if m.page == self.page]
             else:
-                all_matches = self.page.find_all(until, apply_exclusions=apply_exclusions, **kwargs)
+                # If we have a constraint region, search within it instead of the whole page
+                if (
+                    constraint_region
+                    and hasattr(constraint_region, "page")
+                    and constraint_region.page == self.page
+                ):
+                    all_matches = constraint_region.find_all(
+                        until, apply_exclusions=apply_exclusions, **kwargs
+                    )
+                else:
+                    all_matches = self.page.find_all(
+                        until, apply_exclusions=apply_exclusions, **kwargs
+                    )
             matches_in_direction = []
-            # Filter and sort matches based on direction
+            # Filter and sort matches based on direction and anchor parameter
             # Also filter by cross-direction bounds when cross_size='element'
+            # IMPORTANT: Exclude self from matches to prevent finding ourselves
+            all_matches = [m for m in all_matches if m is not self]
+            # Determine reference point based on normalized_anchor
             if direction == "above":
-                matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
+                if normalized_anchor == "top":
+                    ref_y = self.top
+                elif normalized_anchor == "center":
+                    ref_y = (self.top + self.bottom) / 2
+                else:  # 'bottom'
+                    ref_y = self.bottom
+                matches_in_direction = [m for m in all_matches if m.bottom <= ref_y]
                 # Filter by horizontal bounds if cross_size='element'
                 if cross_size == "element":
                     matches_in_direction = [
                         m for m in matches_in_direction if m.x0 < self.x1 and m.x1 > self.x0
                     ]
-                matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
+                # Only sort by position if not using :closest (which is already sorted by quality)
+                if not preserve_order:
+                    matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
             elif direction == "below":
-                matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
+                if normalized_anchor == "top":
+                    ref_y = self.top
+                elif normalized_anchor == "center":
+                    ref_y = (self.top + self.bottom) / 2
+                else:  # 'bottom'
+                    ref_y = self.bottom
+                matches_in_direction = [m for m in all_matches if m.top >= ref_y]
                 # Filter by horizontal bounds if cross_size='element'
                 if cross_size == "element":
                     matches_in_direction = [
                         m for m in matches_in_direction if m.x0 < self.x1 and m.x1 > self.x0
                     ]
-                matches_in_direction.sort(key=lambda e: e.top)
+                # Only sort by position if not using :closest (which is already sorted by quality)
+                if not preserve_order:
+                    matches_in_direction.sort(key=lambda e: e.top)
             elif direction == "left":
-                matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
+                if normalized_anchor == "left":
+                    ref_x = self.x0
+                elif normalized_anchor == "center":
+                    ref_x = (self.x0 + self.x1) / 2
+                else:  # 'right'
+                    ref_x = self.x1
+                matches_in_direction = [m for m in all_matches if m.x1 <= ref_x]
                 # Filter by vertical bounds if cross_size='element'
                 if cross_size == "element":
                     matches_in_direction = [
@@ -235,9 +321,19 @@ class DirectionalMixin:
                         for m in matches_in_direction
                         if m.top < self.bottom and m.bottom > self.top
                     ]
-                matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
+                # Only sort by position if not using :closest (which is already sorted by quality)
+                if not preserve_order:
+                    matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
             elif direction == "right":
-                matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
+                if normalized_anchor == "left":
+                    ref_x = self.x0
+                elif normalized_anchor == "center":
+                    ref_x = (self.x0 + self.x1) / 2
+                else:  # 'right'
+                    ref_x = self.x1
+                matches_in_direction = [m for m in all_matches if m.x0 >= ref_x]
                 # Filter by vertical bounds if cross_size='element'
                 if cross_size == "element":
                     matches_in_direction = [
@@ -245,7 +341,9 @@ class DirectionalMixin:
                         for m in matches_in_direction
                         if m.top < self.bottom and m.bottom > self.top
                     ]
-                matches_in_direction.sort(key=lambda e: e.x0)
+                # Only sort by position if not using :closest (which is already sorted by quality)
+                if not preserve_order:
+                    matches_in_direction.sort(key=lambda e: e.x0)
             if matches_in_direction:
                 target = matches_in_direction[0]
@@ -284,6 +382,22 @@ class DirectionalMixin:
         final_y1 = max(bbox[1], bbox[3])
         final_bbox = (final_x0, final_y0, final_x1, final_y1)
+        # 4.5. Apply within constraint if provided (or from global options)
+        constraint_region = within or natural_pdf.options.layout.directional_within
+        if constraint_region:
+            # Ensure constraint is on same page
+            if hasattr(constraint_region, "page") and constraint_region.page != self.page:
+                raise ValueError("within constraint must be on the same page as the source element")
+            # Apply constraint by intersecting with the constraint region's bounds
+            final_x0 = max(final_x0, constraint_region.x0)
+            final_y0 = max(final_y0, constraint_region.top)
+            final_x1 = min(final_x1, constraint_region.x1)
+            final_y1 = min(final_y1, constraint_region.bottom)
+            # Update final_bbox with constrained values
+            final_bbox = (final_x0, final_y0, final_x1, final_y1)
         # 5. Check if multipage is needed
         # Use global default if not explicitly set
         use_multipage = multipage
@@ -291,6 +405,10 @@ class DirectionalMixin:
         if not multipage and natural_pdf.options.layout.auto_multipage:
             use_multipage = True
+        # Multipage is not supported with within constraint
+        if use_multipage and constraint_region:
+            raise ValueError("multipage navigation is not supported with within constraint")
         # Prevent recursion: if called with internal flag, don't use multipage
         if kwargs.get("_from_flow", False):
             use_multipage = False
@@ -488,6 +606,8 @@ class DirectionalMixin:
         offset: Optional[float] = None,
         apply_exclusions: bool = True,
         multipage: bool = False,
+        within: Optional["Region"] = None,
+        anchor: str = "start",
         **kwargs,
     ) -> Union["Region", "FlowRegion"]:
         """
@@ -503,6 +623,8 @@ class DirectionalMixin:
             apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
             multipage: If True, allows the region to span multiple pages. Returns FlowRegion
                      if the result spans multiple pages, Region otherwise (default: False)
+            within: Optional region to constrain the result to (default: None)
+            anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'top', 'bottom'
             **kwargs: Additional parameters
         Returns:
@@ -534,6 +656,8 @@ class DirectionalMixin:
             offset=offset,
             apply_exclusions=apply_exclusions,
             multipage=multipage,
+            within=within,
+            anchor=anchor,
             **kwargs,
         )
@@ -547,6 +671,8 @@ class DirectionalMixin:
         offset: Optional[float] = None,
         apply_exclusions: bool = True,
         multipage: bool = False,
+        within: Optional["Region"] = None,
+        anchor: str = "start",
         **kwargs,
     ) -> Union["Region", "FlowRegion"]:
         """
@@ -562,6 +688,8 @@ class DirectionalMixin:
                      if the result spans multiple pages, Region otherwise (default: False)
             offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
             apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
+            within: Optional region to constrain the result to (default: None)
+            anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'top', 'bottom'
             **kwargs: Additional parameters
         Returns:
@@ -593,6 +721,8 @@ class DirectionalMixin:
             offset=offset,
             apply_exclusions=apply_exclusions,
             multipage=multipage,
+            within=within,
+            anchor=anchor,
             **kwargs,
         )
@@ -606,6 +736,8 @@ class DirectionalMixin:
         offset: Optional[float] = None,
         apply_exclusions: bool = True,
         multipage: bool = False,
+        within: Optional["Region"] = None,
+        anchor: str = "start",
         **kwargs,
     ) -> Union["Region", "FlowRegion"]:
         """
@@ -621,6 +753,8 @@ class DirectionalMixin:
             apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
             multipage: If True, allows the region to span multiple pages. Returns FlowRegion
                      if the result spans multiple pages, Region otherwise (default: False)
+            within: Optional region to constrain the result to (default: None)
+            anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'left', 'right'
             **kwargs: Additional parameters
         Returns:
@@ -652,6 +786,8 @@ class DirectionalMixin:
             offset=offset,
             apply_exclusions=apply_exclusions,
             multipage=multipage,
+            within=within,
+            anchor=anchor,
             **kwargs,
         )
@@ -665,6 +801,8 @@ class DirectionalMixin:
         offset: Optional[float] = None,
         apply_exclusions: bool = True,
         multipage: bool = False,
+        within: Optional["Region"] = None,
+        anchor: str = "start",
         **kwargs,
     ) -> Union["Region", "FlowRegion"]:
         """
@@ -680,6 +818,8 @@ class DirectionalMixin:
             apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
             multipage: If True, allows the region to span multiple pages. Returns FlowRegion
                      if the result spans multiple pages, Region otherwise (default: False)
+            within: Optional region to constrain the result to (default: None)
+            anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'left', 'right'
             **kwargs: Additional parameters
         Returns:
@@ -711,6 +851,8 @@ class DirectionalMixin:
             offset=offset,
             apply_exclusions=apply_exclusions,
             multipage=multipage,
+            within=within,
+            anchor=anchor,
             **kwargs,
         )

natural_pdf/elements/element_collection.py CHANGED Viewed

@@ -28,6 +28,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
 from PIL import Image, ImageDraw, ImageFont
 from tqdm.auto import tqdm
+from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
 from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
 from natural_pdf.classification.manager import ClassificationManager
 from natural_pdf.classification.mixin import ClassificationMixin
@@ -83,6 +84,7 @@ class ElementCollection(
     ApplyMixin,
     ExportMixin,
     ClassificationMixin,
+    CheckboxDetectionMixin,
     DirectionalCollectionMixin,
     DescribeMixin,
     InspectMixin,
@@ -839,7 +841,6 @@ class ElementCollection(
                 result = " ".join(c.get("text", "") for c in all_char_dicts)
         else:
-            print("JOIN WITHOUT LAYOUT")
             # Default: Simple join without layout
             logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
             result = separator.join(el.extract_text() for el in text_elements)
@@ -3344,7 +3345,45 @@ class ElementCollection(
         # Use collection's apply helper for optional progress bar
         self.apply(_process, show_progress=show_progress)
-        return self
+    def detect_checkboxes(
+        self, *args, show_progress: bool = False, **kwargs
+    ) -> "ElementCollection":
+        """
+        Detect checkboxes on all applicable elements in the collection.
+        This method iterates through elements and calls detect_checkboxes on those
+        that support it (Pages and Regions).
+        Args:
+            *args: Positional arguments to pass to detect_checkboxes.
+            show_progress: Whether to show a progress bar during processing.
+            **kwargs: Keyword arguments to pass to detect_checkboxes.
+        Returns:
+            A new ElementCollection containing all detected checkbox regions.
+        """
+        all_checkboxes = []
+        def _process(el):
+            if hasattr(el, "detect_checkboxes"):
+                # Element supports checkbox detection
+                result = el.detect_checkboxes(*args, **kwargs)
+                if hasattr(result, "elements"):
+                    # Result is a collection
+                    all_checkboxes.extend(result.elements)
+                elif isinstance(result, list):
+                    # Result is a list
+                    all_checkboxes.extend(result)
+                elif result:
+                    # Single result
+                    all_checkboxes.append(result)
+            return el
+        # Use collection's apply helper for optional progress bar
+        self.apply(_process, show_progress=show_progress, desc="Detecting checkboxes")
+        return ElementCollection(all_checkboxes)
     # ------------------------------------------------------------------

natural-pdf 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl

natural-pdf 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl