PyPI - natural-pdf - Versions diffs - 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl - Mend

natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

natural_pdf/__init__.py +8 -0
natural_pdf/analyzers/checkbox/__init__.py +6 -0
natural_pdf/analyzers/checkbox/base.py +265 -0
natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
natural_pdf/analyzers/checkbox/mixin.py +95 -0
natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
natural_pdf/collections/mixins.py +14 -5
natural_pdf/core/element_manager.py +5 -1
natural_pdf/core/page.py +61 -0
natural_pdf/core/page_collection.py +41 -1
natural_pdf/core/pdf.py +24 -1
natural_pdf/describe/base.py +20 -0
natural_pdf/elements/base.py +152 -10
natural_pdf/elements/element_collection.py +41 -2
natural_pdf/elements/region.py +115 -2
natural_pdf/judge.py +1509 -0
natural_pdf/selectors/parser.py +42 -1
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +41 -17
temp/check_model.py +49 -0
temp/check_pdf_content.py +9 -0
temp/checkbox_checks.py +590 -0
temp/checkbox_simple.py +117 -0
temp/checkbox_ux_ideas.py +400 -0
temp/context_manager_prototype.py +177 -0
temp/convert_to_hf.py +60 -0
temp/demo_text_closest.py +66 -0
temp/inspect_model.py +43 -0
temp/rtdetr_dinov2_test.py +49 -0
temp/test_closest_debug.py +26 -0
temp/test_closest_debug2.py +22 -0
temp/test_context_exploration.py +85 -0
temp/test_durham.py +30 -0
temp/test_empty_string.py +16 -0
temp/test_similarity.py +15 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0

natural_pdf/elements/region.py CHANGED Viewed

@@ -19,6 +19,7 @@ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
 from tqdm.auto import tqdm
+from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
 from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
 # --- Shape Detection Mixin --- #
@@ -75,12 +76,41 @@ except ImportError:
 logger = logging.getLogger(__name__)
+class RegionContext:
+    """Context manager for constraining directional operations to a region."""
+    def __init__(self, region: "Region"):
+        """Initialize the context manager with a region.
+        Args:
+            region: The Region to use as a constraint for directional operations
+        """
+        self.region = region
+        self.previous_within = None
+    def __enter__(self):
+        """Enter the context, setting the global directional_within option."""
+        import natural_pdf
+        self.previous_within = natural_pdf.options.layout.directional_within
+        natural_pdf.options.layout.directional_within = self.region
+        return self.region
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Exit the context, restoring the previous directional_within option."""
+        import natural_pdf
+        natural_pdf.options.layout.directional_within = self.previous_within
+        return False  # Don't suppress exceptions
 class Region(
     TextMixin,
     DirectionalMixin,
     ClassificationMixin,
     ExtractionMixin,
     ShapeDetectionMixin,
+    CheckboxDetectionMixin,
     DescribeMixin,
     VisualSearchMixin,
     Visualizable,
@@ -574,6 +604,16 @@ class Region(
                 (self.x0, self.bottom),  # bottom-left
             ]
+    @property
+    def origin(self) -> Optional[Union["Element", "Region"]]:
+        """The element/region that created this region (if it was created via directional method)."""
+        return getattr(self, "source_element", None)
+    @property
+    def endpoint(self) -> Optional["Element"]:
+        """The element where this region stopped (if created with 'until' parameter)."""
+        return getattr(self, "boundary_element", None)
     def _is_point_in_polygon(self, x: float, y: float) -> bool:
         """
         Check if a point is inside the polygon using ray casting algorithm.
@@ -1297,9 +1337,11 @@ class Region(
     def extract_text(
         self,
+        granularity: str = "chars",
         apply_exclusions: bool = True,
         debug: bool = False,
         *,
+        overlap: str = "center",
         newlines: Union[bool, str] = True,
         content_filter=None,
         **kwargs,
@@ -1309,8 +1351,15 @@ class Region(
         layout engine (chars_to_textmap).
         Args:
+            granularity: Level of text extraction - 'chars' (default) or 'words'.
+                - 'chars': Character-by-character extraction (current behavior)
+                - 'words': Word-level extraction with configurable overlap
             apply_exclusions: Whether to apply exclusion regions defined on the parent page.
             debug: Enable verbose debugging output for filtering steps.
+            overlap: How to determine if words overlap with the region (only used when granularity='words'):
+                - 'center': Word center point must be inside (default)
+                - 'full': Word must be fully inside the region
+                - 'partial': Any overlap includes the word
             newlines: Whether to strip newline characters from the extracted text.
             content_filter: Optional content filter to exclude specific text patterns. Can be:
                 - A regex pattern string (characters matching the pattern are EXCLUDED)
@@ -1323,10 +1372,41 @@ class Region(
         Returns:
             Extracted text as string, potentially with layout-based spacing.
         """
+        # Validate granularity parameter
+        if granularity not in ("chars", "words"):
+            raise ValueError(f"granularity must be 'chars' or 'words', got '{granularity}'")
         # Allow 'debug_exclusions' for backward compatibility
         debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
-        logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
+        logger.debug(
+            f"Region {self.bbox}: extract_text called with granularity='{granularity}', overlap='{overlap}', kwargs: {kwargs}"
+        )
+        # Handle word-level extraction
+        if granularity == "words":
+            # Use find_all to get words with proper overlap and exclusion handling
+            word_elements = self.find_all(
+                "text", overlap=overlap, apply_exclusions=apply_exclusions
+            )
+            # Join the text from all matching words
+            text_parts = []
+            for word in word_elements:
+                word_text = word.extract_text()
+                if word_text:  # Skip empty strings
+                    text_parts.append(word_text)
+            result = " ".join(text_parts)
+            # Apply newlines processing if requested
+            if newlines is False:
+                result = result.replace("\n", " ").replace("\r", " ")
+            elif isinstance(newlines, str):
+                result = result.replace("\n", newlines).replace("\r", newlines)
+            return result
+        # Original character-level extraction logic follows...
         # 1. Get Word Elements potentially within this region (initial broad phase)
         # Optimization: Could use spatial query if page elements were indexed
         page_words = self.page.words  # Get all words from the page
@@ -3309,7 +3389,14 @@ class Region(
         name_info = f" name='{self.name}'" if self.name else ""
         type_info = f" type='{self.region_type}'" if self.region_type else ""
         source_info = f" source='{self.source}'" if self.source else ""
-        return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
+        # Add checkbox state if this is a checkbox
+        checkbox_info = ""
+        if self.region_type == "checkbox" and hasattr(self, "is_checked"):
+            state = "checked" if self.is_checked else "unchecked"
+            checkbox_info = f" [{state}]"
+        return f"<Region{name_info}{type_info}{source_info}{checkbox_info} bbox={self.bbox}{poly_info}>"
     def update_text(
         self,
@@ -4038,3 +4125,29 @@ class Region(
         except Exception as e:
             logger.error(f"Error creating viewer for region {self.bbox}: {e}", exc_info=True)
             return None
+    def within(self):
+        """Context manager that constrains directional operations to this region.
+        When used as a context manager, all directional navigation operations
+        (above, below, left, right) will be constrained to the bounds of this region.
+        Returns:
+            RegionContext: A context manager that yields this region
+        Examples:
+            ```python
+            # Create a column region
+            left_col = page.region(right=page.width/2)
+            # All directional operations are constrained to left_col
+            with left_col.within() as col:
+                header = col.find("text[size>14]")
+                content = header.below(until="text[size>14]")
+                # content will only include elements within left_col
+            # Operations outside the context are not constrained
+            full_page_below = header.below()  # Searches full page
+            ```
+        """
+        return RegionContext(self)

natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl

natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl