PyPI - natural-pdf - Versions diffs - 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl - Mend

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

natural_pdf/analyzers/__init__.py +2 -2
natural_pdf/analyzers/guides.py +670 -595
natural_pdf/analyzers/layout/base.py +53 -6
natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
natural_pdf/analyzers/layout/layout_manager.py +18 -14
natural_pdf/analyzers/layout/layout_options.py +1 -0
natural_pdf/analyzers/layout/paddle.py +102 -64
natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
natural_pdf/analyzers/layout/yolo.py +2 -6
natural_pdf/analyzers/shape_detection_mixin.py +15 -6
natural_pdf/classification/manager.py +92 -77
natural_pdf/classification/mixin.py +49 -5
natural_pdf/classification/results.py +1 -1
natural_pdf/cli.py +7 -3
natural_pdf/collections/pdf_collection.py +96 -101
natural_pdf/core/element_manager.py +188 -82
natural_pdf/core/highlighting_service.py +5 -6
natural_pdf/core/page.py +132 -16
natural_pdf/core/pdf.py +486 -71
natural_pdf/describe/__init__.py +18 -12
natural_pdf/describe/base.py +179 -172
natural_pdf/describe/elements.py +155 -155
natural_pdf/describe/mixin.py +27 -19
natural_pdf/describe/summary.py +44 -55
natural_pdf/elements/base.py +134 -18
natural_pdf/elements/collections.py +90 -18
natural_pdf/elements/image.py +2 -1
natural_pdf/elements/line.py +0 -31
natural_pdf/elements/rect.py +0 -14
natural_pdf/elements/region.py +238 -111
natural_pdf/elements/text.py +18 -12
natural_pdf/exporters/__init__.py +4 -1
natural_pdf/exporters/original_pdf.py +12 -4
natural_pdf/extraction/mixin.py +66 -10
natural_pdf/extraction/result.py +1 -1
natural_pdf/flows/flow.py +63 -4
natural_pdf/flows/region.py +4 -4
natural_pdf/ocr/engine.py +83 -2
natural_pdf/ocr/engine_paddle.py +5 -5
natural_pdf/ocr/ocr_factory.py +2 -1
natural_pdf/ocr/ocr_manager.py +24 -13
natural_pdf/ocr/ocr_options.py +3 -10
natural_pdf/qa/document_qa.py +21 -8
natural_pdf/qa/qa_result.py +3 -7
natural_pdf/search/__init__.py +3 -2
natural_pdf/search/lancedb_search_service.py +5 -6
natural_pdf/search/numpy_search_service.py +5 -2
natural_pdf/selectors/parser.py +51 -6
natural_pdf/tables/__init__.py +2 -2
natural_pdf/tables/result.py +7 -6
natural_pdf/utils/bidi_mirror.py +2 -1
natural_pdf/utils/reading_order.py +3 -2
natural_pdf/utils/visualization.py +3 -3
natural_pdf/widgets/viewer.py +0 -1
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
natural_pdf-0.1.34.dist-info/RECORD +121 -0
optimization/memory_comparison.py +73 -58
optimization/pdf_analyzer.py +141 -96
optimization/performance_analysis.py +111 -110
optimization/test_cleanup_methods.py +47 -36
optimization/test_memory_fix.py +40 -39
tools/bad_pdf_eval/__init__.py +0 -1
tools/bad_pdf_eval/analyser.py +35 -18
tools/bad_pdf_eval/collate_summaries.py +22 -18
tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
tools/bad_pdf_eval/eval_suite.py +21 -9
tools/bad_pdf_eval/evaluate_quality.py +198 -0
tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
tools/bad_pdf_eval/llm_enrich.py +71 -39
tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
tools/bad_pdf_eval/reporter.py +1 -1
tools/bad_pdf_eval/utils.py +7 -4
natural_pdf-0.1.32.dist-info/RECORD +0 -118
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0

natural_pdf/elements/rect.py CHANGED Viewed

@@ -88,20 +88,6 @@ class RectangleElement(Element):
         """Get the stroke width of the rectangle."""
         return self._obj.get("linewidth", 0)
-    def text_inside(self, **kwargs) -> Any:
-        """
-        Get text elements inside this rectangle.
-        Args:
-            **kwargs: Additional filter parameters
-        Returns:
-            ElementCollection of text elements inside this rectangle
-        """
-        from natural_pdf.elements.collections import ElementCollection
-        # TODO: Implement proper filtering of elements inside this rectangle
-        return ElementCollection([])  # Placeholder
     def extract_text(self, **kwargs) -> str:
         """

natural_pdf/elements/region.py CHANGED Viewed

@@ -21,15 +21,15 @@ from natural_pdf.elements.text import TextElement  # ADDED IMPORT
 from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import utility
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
-from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
-# Import new utils
-from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
 # ------------------------------------------------------------------
 # Table utilities
 # ------------------------------------------------------------------
 from natural_pdf.tables import TableResult
+from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
+# Import new utils
+from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
 # --- End Classification Imports --- #
@@ -55,9 +55,70 @@ except ImportError:
 logger = logging.getLogger(__name__)
-class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
-    """
-    Represents a rectangular region on a page.
+class Region(
+    DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin
+):
+    """Represents a rectangular region on a page.
+    Regions are fundamental building blocks in natural-pdf that define rectangular
+    areas of a page for analysis, extraction, and navigation. They can be created
+    manually or automatically through spatial navigation methods like .below(), .above(),
+    .left(), and .right() from elements or other regions.
+    Regions integrate multiple analysis capabilities through mixins and provide:
+    - Element filtering and collection within the region boundary
+    - OCR processing for the region area
+    - Table detection and extraction
+    - AI-powered classification and structured data extraction
+    - Visual rendering and debugging capabilities
+    - Text extraction with spatial awareness
+    The Region class supports both rectangular and polygonal boundaries, making it
+    suitable for complex document layouts and irregular shapes detected by layout
+    analysis algorithms.
+    Attributes:
+        page: Reference to the parent Page object.
+        bbox: Bounding box tuple (x0, top, x1, bottom) in PDF coordinates.
+        x0: Left x-coordinate.
+        top: Top y-coordinate (minimum y).
+        x1: Right x-coordinate.
+        bottom: Bottom y-coordinate (maximum y).
+        width: Region width (x1 - x0).
+        height: Region height (bottom - top).
+        polygon: List of coordinate points for non-rectangular regions.
+        label: Optional descriptive label for the region.
+        metadata: Dictionary for storing analysis results and custom data.
+    Example:
+        Creating regions:
+        ```python
+        pdf = npdf.PDF("document.pdf")
+        page = pdf.pages[0]
+        # Manual region creation
+        header_region = page.region(0, 0, page.width, 100)
+        # Spatial navigation from elements
+        summary_text = page.find('text:contains("Summary")')
+        content_region = summary_text.below(until='text[size>12]:bold')
+        # Extract content from region
+        tables = content_region.extract_table()
+        text = content_region.get_text()
+        ```
+        Advanced usage:
+        ```python
+        # OCR processing
+        region.apply_ocr(engine='easyocr', resolution=300)
+        # AI-powered extraction
+        data = region.extract_structured_data(MySchema)
+        # Visual debugging
+        region.show(highlights=['tables', 'text'])
+        ```
     """
     def __init__(
@@ -68,23 +129,46 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         parent=None,
         label: Optional[str] = None,
     ):
-        """
-        Initialize a region.
+        """Initialize a region.
+        Creates a Region object that represents a rectangular or polygonal area on a page.
+        Regions are used for spatial navigation, content extraction, and analysis operations.
         Args:
-            page: Parent page
-            bbox: Bounding box as (x0, top, x1, bottom)
-            polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
-            parent: Optional parent region (for hierarchical document structure)
+            page: Parent Page object that contains this region and provides access
+                to document elements and analysis capabilities.
+            bbox: Bounding box coordinates as (x0, top, x1, bottom) tuple in PDF
+                coordinate system (points, with origin at bottom-left).
+            polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for
+                non-rectangular regions. If provided, the region will use polygon-based
+                intersection calculations instead of simple rectangle overlap.
+            parent: Optional parent region for hierarchical document structure.
+                Useful for maintaining tree-like relationships between regions.
+            label: Optional descriptive label for the region, useful for debugging
+                and identification in complex workflows.
+        Example:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+            page = pdf.pages[0]
+            # Rectangular region
+            header = Region(page, (0, 0, page.width, 100), label="header")
+            # Polygonal region (from layout detection)
+            table_polygon = [(50, 100), (300, 100), (300, 400), (50, 400)]
+            table_region = Region(page, (50, 100, 300, 400),
+                                polygon=table_polygon, label="table")
+            ```
+        Note:
+            Regions are typically created through page methods like page.region() or
+            spatial navigation methods like element.below(). Direct instantiation is
+            used mainly for advanced workflows or layout analysis integration.
         """
         self._page = page
         self._bbox = bbox
         self._polygon = polygon
-        self._multi_page_elements = None
-        self._spans_pages = False
-        self._page_range = None
-        self.start_element = None
-        self.end_element = None
         self.metadata: Dict[str, Any] = {}
         # Analysis results live under self.metadata['analysis'] via property
@@ -444,10 +528,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Returns:
             True if the element is in the region, False otherwise
         """
-        # If we have multi-page elements cached, check if the element is in the list
-        if self._spans_pages and self._multi_page_elements is not None:
-            return element in self._multi_page_elements
         # Check if element is on the same page
         if not hasattr(element, "page") or element.page != self._page:
             return False
@@ -614,12 +694,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         """
         # Apply global options as defaults
         import natural_pdf
         if resolution is None:
             if natural_pdf.options.image.resolution is not None:
                 resolution = natural_pdf.options.image.resolution
             else:
                 resolution = 144  # Default resolution when none specified
         # Handle the case where user wants the cropped region to have a specific width
         page_kwargs = kwargs.copy()
         effective_resolution = resolution  # Start with the provided resolution
@@ -722,12 +803,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         """
         # Apply global options as defaults
         import natural_pdf
         if resolution is None:
             if natural_pdf.options.image.resolution is not None:
                 resolution = natural_pdf.options.image.resolution
             else:
                 resolution = 144  # Default resolution when none specified
         if not self._page:
             raise ValueError("Region must be associated with a page to show.")
@@ -764,7 +846,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         )
     def save(
-        self, filename: str, resolution: Optional[float] = None, labels: bool = True, legend_position: str = "right"
+        self,
+        filename: str,
+        resolution: Optional[float] = None,
+        labels: bool = True,
+        legend_position: str = "right",
     ) -> "Region":
         """
         Save the page with this region highlighted to an image file.
@@ -780,17 +866,20 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         """
         # Apply global options as defaults
         import natural_pdf
         if resolution is None:
             if natural_pdf.options.image.resolution is not None:
                 resolution = natural_pdf.options.image.resolution
             else:
                 resolution = 144  # Default resolution when none specified
         # Highlight this region if not already highlighted
         self.highlight()
         # Save the highlighted image
-        self._page.save_image(filename, resolution=resolution, labels=labels, legend_position=legend_position)
+        self._page.save_image(
+            filename, resolution=resolution, labels=labels, legend_position=legend_position
+        )
         return self
     def save_image(
@@ -816,12 +905,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         """
         # Apply global options as defaults
         import natural_pdf
         if resolution is None:
             if natural_pdf.options.image.resolution is not None:
                 resolution = natural_pdf.options.image.resolution
             else:
                 resolution = 144  # Default resolution when none specified
         # Get the region image
         image = self.to_image(
             resolution=resolution,
@@ -856,27 +946,34 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
                        This helps avoid detecting box borders/slivers as content.
-        Returns:
-            New Region with visual whitespace trimmed from all edges
+        Returns
+        ------
-        Example:
-            # Basic trimming with 1 pixel padding and 0.5px pre-shrink
-            trimmed = region.trim()
+        New Region with visual whitespace trimmed from all edges
-            # More aggressive trimming with no padding and no pre-shrink
-            tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
+        Examples
+        --------
-            # Conservative trimming with more padding
-            loose = region.trim(padding=3, threshold=0.98)
+        ```python
+        # Basic trimming with 1 pixel padding and 0.5px pre-shrink
+        trimmed = region.trim()
+        # More aggressive trimming with no padding and no pre-shrink
+        tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
+        # Conservative trimming with more padding
+        loose = region.trim(padding=3, threshold=0.98)
+        ```
         """
         # Apply global options as defaults
         import natural_pdf
         if resolution is None:
             if natural_pdf.options.image.resolution is not None:
                 resolution = natural_pdf.options.image.resolution
             else:
                 resolution = 144  # Default resolution when none specified
         # Pre-shrink the region to avoid box slivers
         work_region = (
             self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
@@ -885,9 +982,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         )
         # Get the region image
-        image = work_region.to_image(
-            resolution=resolution, crop=True, include_highlights=False
-        )
+        image = work_region.to_image(resolution=resolution, crop=True, include_highlights=False)
         if image is None:
             logger.warning(
@@ -1113,12 +1208,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Returns:
             List of elements in the region
         """
-        # If we have multi-page elements, return those
-        if self._spans_pages and self._multi_page_elements is not None:
-            # TODO: Apply selector to multi-page elements if needed
-            return self._multi_page_elements
-        # Otherwise, get elements from the page
         if selector:
             # Find elements on the page matching the selector
             page_elements = self.page.find_all(
@@ -1257,7 +1346,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                 try:
                     cell_regions_in_table = [
                         c
-                        for c in self.page.find_all("region[type=table_cell]", apply_exclusions=False)
+                        for c in self.page.find_all(
+                            "region[type=table_cell]", apply_exclusions=False
+                        )
                         if self.intersects(c)
                     ]
                 except Exception as _cells_err:
@@ -1324,7 +1415,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         # This must happen AFTER alias handling (so strategies are final)
         # and BEFORE we delegate to _extract_table_* helpers.
         # -------------------------------------------------------------
-        if "text" in (table_settings.get("vertical_strategy"), table_settings.get("horizontal_strategy")):
+        if "text" in (
+            table_settings.get("vertical_strategy"),
+            table_settings.get("horizontal_strategy"),
+        ):
             page_cfg = getattr(self.page, "_config", {})
             # Ensure text_* tolerances passed to pdfplumber
             if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
@@ -1466,19 +1560,35 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             table_settings.get("vertical_strategy"),
             table_settings.get("horizontal_strategy"),
         )
-        if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
+        if (
+            _uses_text
+            and "text_x_tolerance" not in table_settings
+            and "x_tolerance" not in table_settings
+        ):
             x_tol = pdf_cfg.get("x_tolerance")
             if x_tol is not None:
                 table_settings.setdefault("text_x_tolerance", x_tol)
-        if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
+        if (
+            _uses_text
+            and "text_y_tolerance" not in table_settings
+            and "y_tolerance" not in table_settings
+        ):
             y_tol = pdf_cfg.get("y_tolerance")
             if y_tol is not None:
                 table_settings.setdefault("text_y_tolerance", y_tol)
-        if _uses_text and "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
+        if (
+            _uses_text
+            and "snap_tolerance" not in table_settings
+            and "snap_x_tolerance" not in table_settings
+        ):
             snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
             table_settings.setdefault("snap_tolerance", snap)
-        if _uses_text and "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
+        if (
+            _uses_text
+            and "join_tolerance" not in table_settings
+            and "join_x_tolerance" not in table_settings
+        ):
             join = table_settings.get("snap_tolerance", 1)
             table_settings.setdefault("join_tolerance", join)
             table_settings.setdefault("join_x_tolerance", join)
@@ -1510,11 +1620,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             table_settings.get("vertical_strategy"),
             table_settings.get("horizontal_strategy"),
         )
-        if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
+        if (
+            _uses_text
+            and "text_x_tolerance" not in table_settings
+            and "x_tolerance" not in table_settings
+        ):
             x_tol = pdf_cfg.get("x_tolerance")
             if x_tol is not None:
                 table_settings.setdefault("text_x_tolerance", x_tol)
-        if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
+        if (
+            _uses_text
+            and "text_y_tolerance" not in table_settings
+            and "y_tolerance" not in table_settings
+        ):
             y_tol = pdf_cfg.get("y_tolerance")
             if y_tol is not None:
                 table_settings.setdefault("text_y_tolerance", y_tol)
@@ -1942,23 +2060,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         else:
             raise ValueError("Internal error: No selector or text provided.")
-        # If we span multiple pages, filter our elements
-        # TODO: Revisit multi-page region logic
-        if self._spans_pages and self._multi_page_elements is not None:
-            logger.warning("find_all on multi-page regions is not fully implemented.")
-            # Temporary: Apply filter directly to cached elements
-            try:
-                selector_obj = parse_selector(effective_selector)
-                # Pass regex/case flags down
-                kwargs["regex"] = regex
-                kwargs["case"] = case
-                filter_func = selector_to_filter_func(selector_obj, **kwargs)
-                matching = [el for el in self._multi_page_elements if filter_func(el)]
-                return ElementCollection(matching)
-            except Exception as e:
-                logger.error(f"Error applying selector to multi-page region elements: {e}")
-                return ElementCollection([])
         # Normal case: Region is on a single page
         try:
             # Parse the final selector string
@@ -2016,10 +2117,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Examples
         ---------
-        >>> def llm_ocr(region):
-        ...     image = region.to_image(resolution=300, crop=True)
-        ...     return my_llm_client.ocr(image)
-        >>> region.apply_ocr(function=llm_ocr)
+        ```python
+        def llm_ocr(region):
+            image = region.to_image(resolution=300, crop=True)
+            return my_llm_client.ocr(image)
+        region.apply_ocr(function=llm_ocr)
+        ```
         Args:
             replace: Whether to remove existing OCR elements first (default ``True``).
@@ -2088,15 +2191,24 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             # Remove OCR CHAR dicts overlapping region
             for char in list(self.page._element_mgr.chars):
                 # char can be dict or TextElement; normalise
-                char_src = char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
+                char_src = (
+                    char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
+                )
                 if char_src == "ocr":
                     # Rough bbox for dicts
                     if isinstance(char, dict):
-                        cx0, ctop, cx1, cbottom = char.get("x0", 0), char.get("top", 0), char.get("x1", 0), char.get("bottom", 0)
+                        cx0, ctop, cx1, cbottom = (
+                            char.get("x0", 0),
+                            char.get("top", 0),
+                            char.get("x1", 0),
+                            char.get("bottom", 0),
+                        )
                     else:
                         cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
                     # Quick overlap check
-                    if not (cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom):
+                    if not (
+                        cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom
+                    ):
                         _safe_remove(char)
             logger.info(
@@ -2219,7 +2331,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         """
         Apply a custom OCR function to this region and create text elements from the results.
-        This is useful when you want to use a custom OCR method (e.g., an LLM API,
+        This is useful when you want to use a custom OCR method (e.g., an LLM API,
         specialized OCR service, or any custom logic) instead of the built-in OCR engines.
         Args:
@@ -2244,15 +2356,15 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                 image = region.to_image(resolution=300, crop=True)
                 # Call your LLM API here
                 return llm_client.ocr(image)
             region.apply_custom_ocr(ocr_with_llm)
             # Using with a custom OCR service
             def ocr_with_service(region):
                 img_bytes = region.to_image(crop=True).tobytes()
                 response = ocr_service.process(img_bytes)
                 return response.text
             region.apply_custom_ocr(ocr_with_service, source_label="my-ocr-service")
         """
         # If replace is True, remove existing OCR elements in this region
@@ -2260,9 +2372,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             logger.info(
                 f"Region {self.bbox}: Removing existing OCR elements before applying custom OCR."
             )
             removed_count = 0
             # Helper to remove a single element safely
             def _safe_remove(elem):
                 nonlocal removed_count
@@ -2281,41 +2393,60 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                         success = False
                 if success:
                     removed_count += 1
-            # Remove OCR elements overlapping this region
-            for word in list(self.page._element_mgr.words):
-                if getattr(word, "source", "").startswith("ocr") and self.intersects(word):
-                    _safe_remove(word)
-            # Also check custom-ocr sources
+            # Remove ALL OCR elements overlapping this region
+            # Remove elements with source=="ocr" (built-in OCR) or matching the source_label (previous custom OCR)
             for word in list(self.page._element_mgr.words):
-                if getattr(word, "source", "") == source_label and self.intersects(word):
+                word_source = getattr(word, "source", "")
+                # Match built-in OCR behavior: remove elements with source "ocr" exactly
+                # Also remove elements with the same source_label to avoid duplicates
+                if (word_source == "ocr" or word_source == source_label) and self.intersects(word):
                     _safe_remove(word)
-            if removed_count > 0:
-                logger.info(
-                    f"Region {self.bbox}: Removed {removed_count} existing OCR elements."
+            # Also remove char dicts if needed (matching built-in OCR)
+            for char in list(self.page._element_mgr.chars):
+                # char can be dict or TextElement; normalize
+                char_src = (
+                    char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
                 )
+                if char_src == "ocr" or char_src == source_label:
+                    # Rough bbox for dicts
+                    if isinstance(char, dict):
+                        cx0, ctop, cx1, cbottom = (
+                            char.get("x0", 0),
+                            char.get("top", 0),
+                            char.get("x1", 0),
+                            char.get("bottom", 0),
+                        )
+                    else:
+                        cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
+                    # Quick overlap check
+                    if not (
+                        cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom
+                    ):
+                        _safe_remove(char)
+            if removed_count > 0:
+                logger.info(f"Region {self.bbox}: Removed {removed_count} existing OCR elements.")
         # Call the custom OCR function
         try:
             logger.debug(f"Region {self.bbox}: Calling custom OCR function...")
             ocr_text = ocr_function(self)
             if ocr_text is not None and not isinstance(ocr_text, str):
                 logger.warning(
                     f"Custom OCR function returned non-string type ({type(ocr_text)}). "
                     f"Converting to string."
                 )
                 ocr_text = str(ocr_text)
         except Exception as e:
             logger.error(
-                f"Error calling custom OCR function for region {self.bbox}: {e}",
-                exc_info=True
+                f"Error calling custom OCR function for region {self.bbox}: {e}", exc_info=True
             )
             return self
         # Create text element if we got text
         if ocr_text is not None:
             # Use the to_text_element method to create the element
@@ -2323,16 +2454,16 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                 text_content=ocr_text,
                 source_label=source_label,
                 confidence=confidence,
-                add_to_page=add_to_page
+                add_to_page=add_to_page,
             )
             logger.info(
                 f"Region {self.bbox}: Created text element with {len(ocr_text)} chars"
                 f"{' and added to page' if add_to_page else ''}"
             )
         else:
             logger.debug(f"Region {self.bbox}: Custom OCR function returned None (no text found)")
         return self
     def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
@@ -3280,9 +3411,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             return []
         # Build arrays of centers
-        centers = np.array([
-            [(c.x0 + c.x1) / 2.0, (c.top + c.bottom) / 2.0] for c in cell_regions
-        ])
+        centers = np.array([[(c.x0 + c.x1) / 2.0, (c.top + c.bottom) / 2.0] for c in cell_regions])
         xs = centers[:, 0]
         ys = centers[:, 1]
@@ -3314,5 +3443,3 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             table_grid[row_idx][col_idx] = text_val if text_val else None
         return table_grid

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl