PyPI - natural-pdf - Versions diffs - 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl - Mend

natural-pdf 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

natural_pdf/__init__.py +45 -0
natural_pdf/analyzers/guides.py +359 -0
natural_pdf/core/element_manager.py +4 -0
natural_pdf/core/page.py +88 -22
natural_pdf/core/page_collection.py +75 -0
natural_pdf/core/pdf.py +33 -0
natural_pdf/describe/base.py +48 -7
natural_pdf/elements/base.py +408 -43
natural_pdf/elements/element_collection.py +83 -10
natural_pdf/elements/region.py +217 -178
natural_pdf/elements/text.py +5 -3
natural_pdf/flows/element.py +48 -46
natural_pdf/flows/flow.py +175 -480
natural_pdf/flows/region.py +76 -0
natural_pdf/selectors/parser.py +180 -9
natural_pdf/utils/pdfminer_patches.py +136 -0
natural_pdf/utils/sections.py +346 -0
natural_pdf/utils/spatial.py +169 -0
{natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/METADATA +1 -1
{natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/RECORD +24 -21
{natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/WHEEL +0 -0
{natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/top_level.txt +0 -0

natural_pdf/elements/region.py CHANGED Viewed

@@ -7,6 +7,7 @@ from typing import (
     List,
     Literal,
     Optional,
+    Set,
     Tuple,
     Union,
     overload,
@@ -346,6 +347,7 @@ class Region(
         include_source: bool = False,
         until: Optional[str] = None,
         include_endpoint: bool = True,
+        offset: Optional[float] = None,
         **kwargs,
     ) -> "Region":
         """
@@ -357,11 +359,18 @@ class Region(
             include_source: Whether to include this region in the result (default: False)
             until: Optional selector string to specify an upper boundary element
             include_endpoint: Whether to include the boundary element in the region (default: True)
+            offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
             **kwargs: Additional parameters
         Returns:
             Region object representing the area above
         """
+        # Use global default if offset not provided
+        if offset is None:
+            import natural_pdf
+            offset = natural_pdf.options.layout.directional_offset
         return self._direction(
             direction="above",
             size=height,
@@ -369,6 +378,7 @@ class Region(
             include_source=include_source,
             until=until,
             include_endpoint=include_endpoint,
+            offset=offset,
             **kwargs,
         )
@@ -379,6 +389,7 @@ class Region(
         include_source: bool = False,
         until: Optional[str] = None,
         include_endpoint: bool = True,
+        offset: Optional[float] = None,
         **kwargs,
     ) -> "Region":
         """
@@ -390,11 +401,18 @@ class Region(
             include_source: Whether to include this region in the result (default: False)
             until: Optional selector string to specify a lower boundary element
             include_endpoint: Whether to include the boundary element in the region (default: True)
+            offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
             **kwargs: Additional parameters
         Returns:
             Region object representing the area below
         """
+        # Use global default if offset not provided
+        if offset is None:
+            import natural_pdf
+            offset = natural_pdf.options.layout.directional_offset
         return self._direction(
             direction="below",
             size=height,
@@ -402,16 +420,18 @@ class Region(
             include_source=include_source,
             until=until,
             include_endpoint=include_endpoint,
+            offset=offset,
             **kwargs,
         )
     def left(
         self,
         width: Optional[float] = None,
-        height: str = "full",
+        height: str = "element",
         include_source: bool = False,
         until: Optional[str] = None,
         include_endpoint: bool = True,
+        offset: Optional[float] = None,
         **kwargs,
     ) -> "Region":
         """
@@ -423,11 +443,18 @@ class Region(
             include_source: Whether to include this region in the result (default: False)
             until: Optional selector string to specify a left boundary element
             include_endpoint: Whether to include the boundary element in the region (default: True)
+            offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
             **kwargs: Additional parameters
         Returns:
             Region object representing the area to the left
         """
+        # Use global default if offset not provided
+        if offset is None:
+            import natural_pdf
+            offset = natural_pdf.options.layout.directional_offset
         return self._direction(
             direction="left",
             size=width,
@@ -435,16 +462,18 @@ class Region(
             include_source=include_source,
             until=until,
             include_endpoint=include_endpoint,
+            offset=offset,
             **kwargs,
         )
     def right(
         self,
         width: Optional[float] = None,
-        height: str = "full",
+        height: str = "element",
         include_source: bool = False,
         until: Optional[str] = None,
         include_endpoint: bool = True,
+        offset: Optional[float] = None,
         **kwargs,
     ) -> "Region":
         """
@@ -456,11 +485,18 @@ class Region(
             include_source: Whether to include this region in the result (default: False)
             until: Optional selector string to specify a right boundary element
             include_endpoint: Whether to include the boundary element in the region (default: True)
+            offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
             **kwargs: Additional parameters
         Returns:
             Region object representing the area to the right
         """
+        # Use global default if offset not provided
+        if offset is None:
+            import natural_pdf
+            offset = natural_pdf.options.layout.directional_offset
         return self._direction(
             direction="right",
             size=width,
@@ -468,6 +504,7 @@ class Region(
             include_source=include_source,
             until=until,
             include_endpoint=include_endpoint,
+            offset=offset,
             **kwargs,
         )
@@ -638,12 +675,10 @@ class Region(
         Returns:
             True if the element is in the region, False otherwise
         """
-        # Check if element is on the same page
-        if not hasattr(element, "page") or element.page != self._page:
-            return False
+        # Use centralized spatial utility for consistency
+        from natural_pdf.utils.spatial import is_element_in_region
-        return self.is_element_center_inside(element)
-        # return self.intersects(element)
+        return is_element_in_region(element, self, strategy="center", check_page=True)
     def contains(self, element: "Element") -> bool:
         """
@@ -739,7 +774,12 @@ class Region(
         )
     def exclude(self):
-        self.page.add_exclusion(self)
+        """
+        Exclude this region from text extraction and other operations.
+        This excludes everything within the region's bounds.
+        """
+        self.page.add_exclusion(self, method="region")
     def highlight(
         self,
@@ -1224,12 +1264,36 @@ class Region(
                 selector, apply_exclusions=apply_exclusions, **kwargs
             )
             # Filter those elements to only include ones within this region
-            return [e for e in page_elements if self._is_element_in_region(e)]
+            elements = [e for e in page_elements if self._is_element_in_region(e)]
         else:
             # Get all elements from the page
             page_elements = self.page.get_elements(apply_exclusions=apply_exclusions)
             # Filter to elements in this region
-            return [e for e in page_elements if self._is_element_in_region(e)]
+            elements = [e for e in page_elements if self._is_element_in_region(e)]
+        # Apply boundary exclusions if this is a section with boundary settings
+        if hasattr(self, "_boundary_exclusions") and self._boundary_exclusions != "both":
+            excluded_ids = set()
+            if self._boundary_exclusions == "none":
+                # Exclude both start and end elements
+                if hasattr(self, "start_element") and self.start_element:
+                    excluded_ids.add(id(self.start_element))
+                if hasattr(self, "end_element") and self.end_element:
+                    excluded_ids.add(id(self.end_element))
+            elif self._boundary_exclusions == "start":
+                # Exclude only end element
+                if hasattr(self, "end_element") and self.end_element:
+                    excluded_ids.add(id(self.end_element))
+            elif self._boundary_exclusions == "end":
+                # Exclude only start element
+                if hasattr(self, "start_element") and self.start_element:
+                    excluded_ids.add(id(self.start_element))
+            if excluded_ids:
+                elements = [e for e in elements if id(e) not in excluded_ids]
+        return elements
     def extract_text(
         self,
@@ -1300,6 +1364,34 @@ class Region(
         elif debug:
             logger.debug(f"Region {self.bbox}: Not applying exclusions (apply_exclusions=False).")
+        # Add boundary element exclusions if this is a section with boundary settings
+        if hasattr(self, "_boundary_exclusions") and self._boundary_exclusions != "both":
+            boundary_exclusions = []
+            if self._boundary_exclusions == "none":
+                # Exclude both start and end elements
+                if hasattr(self, "start_element") and self.start_element:
+                    boundary_exclusions.append(self.start_element)
+                if hasattr(self, "end_element") and self.end_element:
+                    boundary_exclusions.append(self.end_element)
+            elif self._boundary_exclusions == "start":
+                # Exclude only end element
+                if hasattr(self, "end_element") and self.end_element:
+                    boundary_exclusions.append(self.end_element)
+            elif self._boundary_exclusions == "end":
+                # Exclude only start element
+                if hasattr(self, "start_element") and self.start_element:
+                    boundary_exclusions.append(self.start_element)
+            # Add boundary elements as exclusion regions
+            for elem in boundary_exclusions:
+                if hasattr(elem, "bbox"):
+                    exclusion_regions.append(elem)
+                    if debug:
+                        logger.debug(
+                            f"Adding boundary exclusion: {elem.extract_text().strip()} at {elem.bbox}"
+                        )
         # 4. Spatially Filter Characters using Utility
         # Pass self as the target_region for precise polygon checks etc.
         filtered_chars = filter_chars_spatially(
@@ -1510,6 +1602,49 @@ class Region(
         logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
+        # For stream method with text-based edge detection and explicit vertical lines,
+        # adjust guides to ensure they fall within text bounds for proper intersection
+        if (
+            effective_method == "pdfplumber"
+            and table_settings.get("horizontal_strategy") == "text"
+            and table_settings.get("vertical_strategy") == "explicit"
+            and "explicit_vertical_lines" in table_settings
+        ):
+            text_elements = self.find_all("text", apply_exclusions=apply_exclusions)
+            if text_elements:
+                text_bounds = text_elements.merge().bbox
+                text_left = text_bounds[0]
+                text_right = text_bounds[2]
+                # Adjust vertical guides to fall within text bounds
+                original_verticals = table_settings["explicit_vertical_lines"]
+                adjusted_verticals = []
+                for v in original_verticals:
+                    if v < text_left:
+                        # Guide is left of text bounds, clip to text start
+                        adjusted_verticals.append(text_left)
+                        logger.debug(
+                            f"Region {self.bbox}: Adjusted left guide from {v:.1f} to {text_left:.1f}"
+                        )
+                    elif v > text_right:
+                        # Guide is right of text bounds, clip to text end
+                        adjusted_verticals.append(text_right)
+                        logger.debug(
+                            f"Region {self.bbox}: Adjusted right guide from {v:.1f} to {text_right:.1f}"
+                        )
+                    else:
+                        # Guide is within text bounds, keep as is
+                        adjusted_verticals.append(v)
+                # Update table settings with adjusted guides
+                table_settings["explicit_vertical_lines"] = adjusted_verticals
+                logger.debug(
+                    f"Region {self.bbox}: Adjusted {len(original_verticals)} guides for stream extraction. "
+                    f"Text bounds: {text_left:.1f}-{text_right:.1f}"
+                )
         # Use the selected method
         if effective_method == "tatr":
             table_rows = self._extract_table_tatr(
@@ -2765,69 +2900,31 @@ class Region(
         if orientation not in ["vertical", "horizontal"]:
             raise ValueError(f"orientation must be 'vertical' or 'horizontal', got '{orientation}'")
-        # Calculate the section boundaries based on orientation and include_boundaries
-        if orientation == "vertical":
-            # Use full width of the parent region for vertical sections
-            x0 = self.x0  # Use parent region's left boundary
-            x1 = self.x1  # Use parent region's right boundary
-            # Determine vertical boundaries based on include_boundaries
-            if include_boundaries == "both":
-                # Include both boundary elements
-                top = start_element.top
-                bottom = end_element.bottom
-            elif include_boundaries == "start":
-                # Include start element, exclude end element
-                top = start_element.top
-                bottom = end_element.top  # Stop at the top of end element
-            elif include_boundaries == "end":
-                # Exclude start element, include end element
-                top = start_element.bottom  # Start at the bottom of start element
-                bottom = end_element.bottom
-            else:  # "none"
-                # Exclude both boundary elements
-                top = start_element.bottom  # Start at the bottom of start element
-                bottom = end_element.top  # Stop at the top of end element
-            # Ensure valid boundaries
-            if top >= bottom:
-                logger.debug(f"Invalid section boundaries: top={top} >= bottom={bottom}")
-                # Return an empty region
-                return Region(self.page, (x0, top, x0, top))
-        else:  # horizontal
-            # Use full height of the parent region for horizontal sections
-            top = self.top  # Use parent region's top boundary
-            bottom = self.bottom  # Use parent region's bottom boundary
-            # Determine horizontal boundaries based on include_boundaries
-            if include_boundaries == "both":
-                # Include both boundary elements
-                x0 = start_element.x0
-                x1 = end_element.x1
-            elif include_boundaries == "start":
-                # Include start element, exclude end element
-                x0 = start_element.x0
-                x1 = end_element.x0  # Stop at the left of end element
-            elif include_boundaries == "end":
-                # Exclude start element, include end element
-                x0 = start_element.x1  # Start at the right of start element
-                x1 = end_element.x1
-            else:  # "none"
-                # Exclude both boundary elements
-                x0 = start_element.x1  # Start at the right of start element
-                x1 = end_element.x0  # Stop at the left of end element
-            # Ensure valid boundaries
-            if x0 >= x1:
-                logger.debug(f"Invalid section boundaries: x0={x0} >= x1={x1}")
-                # Return an empty region
-                return Region(self.page, (x0, top, x0, top))
+        # Use centralized section utilities
+        from natural_pdf.utils.sections import calculate_section_bounds, validate_section_bounds
+        # Calculate section boundaries
+        bounds = calculate_section_bounds(
+            start_element=start_element,
+            end_element=end_element,
+            include_boundaries=include_boundaries,
+            orientation=orientation,
+            parent_bounds=self.bbox,
+        )
+        # Validate boundaries
+        if not validate_section_bounds(bounds, orientation):
+            # Return an empty region at the start position
+            x0, top, _, _ = bounds
+            return Region(self.page, (x0, top, x0, top))
         # Create new region
-        section = Region(self.page, (x0, top, x1, bottom))
-        # Store the original boundary elements for reference
+        section = Region(self.page, bounds)
+        # Store the original boundary elements and exclusion info
         section.start_element = start_element
         section.end_element = end_element
+        section._boundary_exclusions = include_boundaries
         return section
@@ -2851,121 +2948,63 @@ class Region(
             List of Region objects representing the extracted sections
         """
         from natural_pdf.elements.element_collection import ElementCollection
+        from natural_pdf.utils.sections import extract_sections_from_region
+        # Use centralized section extraction logic
+        sections = extract_sections_from_region(
+            region=self,
+            start_elements=start_elements,
+            end_elements=end_elements,
+            include_boundaries=include_boundaries,
+            orientation=orientation,
+        )
-        # Process string selectors to find elements WITHIN THIS REGION
-        if isinstance(start_elements, str):
-            start_elements = self.find_all(start_elements)  # Use region's find_all
-            if hasattr(start_elements, "elements"):
-                start_elements = start_elements.elements
+        return ElementCollection(sections)
-        if isinstance(end_elements, str):
-            end_elements = self.find_all(end_elements)  # Use region's find_all
-            if hasattr(end_elements, "elements"):
-                end_elements = end_elements.elements
+    def split(self, divider, **kwargs) -> "ElementCollection[Region]":
+        """
+        Divide this region into sections based on the provided divider elements.
-        # Ensure start_elements is a list (or similar iterable)
-        if start_elements is None or not hasattr(start_elements, "__iter__"):
-            logger.warning(
-                "get_sections requires valid start_elements (selector or list). Returning empty."
-            )
-            return []
-        # Ensure end_elements is a list if provided
-        if end_elements is not None and not hasattr(end_elements, "__iter__"):
-            logger.warning("end_elements must be iterable if provided. Ignoring.")
-            end_elements = []
-        elif end_elements is None:
-            end_elements = []
-        # If no start elements found within the region, return empty list
-        if not start_elements:
-            return []
+        Args:
+            divider: Elements or selector string that mark section boundaries
+            **kwargs: Additional parameters passed to get_sections()
+                - include_boundaries: How to include boundary elements (default: 'start')
+                - orientation: 'vertical' or 'horizontal' (default: 'vertical')
-        # Sort all elements within the region based on orientation
-        all_elements_in_region = self.get_elements()
-        if orientation == "vertical":
-            all_elements_in_region.sort(key=lambda e: (e.top, e.x0))
-        else:  # horizontal
-            all_elements_in_region.sort(key=lambda e: (e.x0, e.top))
-        if not all_elements_in_region:
-            return []  # Cannot create sections if region is empty
-        # Map elements to their indices in the sorted list
-        element_to_index = {el: i for i, el in enumerate(all_elements_in_region)}
-        # Mark section boundaries using indices from the sorted list
-        section_boundaries = []
-        # Add start element indexes
-        for element in start_elements:
-            idx = element_to_index.get(element)
-            if idx is not None:
-                section_boundaries.append({"index": idx, "element": element, "type": "start"})
-            # else: Element found by selector might not be geometrically in region? Log warning?
-        # Add end element indexes if provided
-        for element in end_elements:
-            idx = element_to_index.get(element)
-            if idx is not None:
-                section_boundaries.append({"index": idx, "element": element, "type": "end"})
-        # Sort boundaries by index (document order within the region)
-        section_boundaries.sort(key=lambda x: x["index"])
-        # Generate sections
-        sections = []
-        current_start_boundary = None
-        for i, boundary in enumerate(section_boundaries):
-            # If it's a start boundary and we don't have a current start
-            if boundary["type"] == "start" and current_start_boundary is None:
-                current_start_boundary = boundary
-            # If it's an end boundary and we have a current start
-            elif boundary["type"] == "end" and current_start_boundary is not None:
-                # Create a section from current_start to this boundary
-                start_element = current_start_boundary["element"]
-                end_element = boundary["element"]
-                # Use the helper, ensuring elements are from within the region
-                section = self.get_section_between(
-                    start_element, end_element, include_boundaries, orientation
-                )
-                sections.append(section)
-                current_start_boundary = None  # Reset
-            # If it's another start boundary and we have a current start (split by starts only)
-            elif (
-                boundary["type"] == "start"
-                and current_start_boundary is not None
-                and not end_elements
-            ):
-                # End the previous section just before this start boundary
-                start_element = current_start_boundary["element"]
-                # Find the element immediately preceding this start in the sorted list
-                end_idx = boundary["index"] - 1
-                if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
-                    end_element = all_elements_in_region[end_idx]
-                    section = self.get_section_between(
-                        start_element, end_element, include_boundaries, orientation
+        Returns:
+            ElementCollection of Region objects representing the sections
+        Example:
+            # Split a region by bold text
+            sections = region.split("text:bold")
+            # Split horizontally by vertical lines
+            sections = region.split("line[orientation=vertical]", orientation="horizontal")
+        """
+        # Default to 'start' boundaries for split (include divider at start of each section)
+        if "include_boundaries" not in kwargs:
+            kwargs["include_boundaries"] = "start"
+        sections = self.get_sections(start_elements=divider, **kwargs)
+        # Add section before first divider if there's content
+        if sections and hasattr(sections[0], "start_element"):
+            first_divider = sections[0].start_element
+            if first_divider:
+                # Get all elements before the first divider
+                all_elements = self.get_elements()
+                if all_elements and all_elements[0] != first_divider:
+                    # Create section from start to just before first divider
+                    initial_section = self.get_section_between(
+                        start_element=None,
+                        end_element=first_divider,
+                        include_boundaries="none",
+                        orientation=kwargs.get("orientation", "vertical"),
                     )
-                    sections.append(section)
-                # Else: Section started and ended by consecutive start elements? Create empty?
-                # For now, just reset and start new section
-                # Start the new section
-                current_start_boundary = boundary
-        # Handle the last section if we have a current start
-        if current_start_boundary is not None:
-            start_element = current_start_boundary["element"]
-            # End at the last element within the region
-            end_element = all_elements_in_region[-1]
-            section = self.get_section_between(
-                start_element, end_element, include_boundaries, orientation
-            )
-            sections.append(section)
+                    if initial_section and initial_section.get_elements():
+                        sections.insert(0, initial_section)
-        return ElementCollection(sections)
+        return sections
     def create_cells(self):
         """

natural_pdf/elements/text.py CHANGED Viewed

@@ -459,9 +459,11 @@ class TextElement(Element):
     @property
     def highlight_color(self):
         """Return RGB(A) tuple of highlight colour if stored."""
-        return self._obj.get("highlight_color") or self.metadata.get("decoration", {}).get(
-            "highlight_color"
-        )
+        # Check _obj first, being careful with falsy values like 0.0
+        if "highlight_color" in self._obj:
+            return self._obj["highlight_color"]
+        # Fall back to metadata
+        return self.metadata.get("decoration", {}).get("highlight_color")
     def __repr__(self) -> str:
         """String representation of the text element."""

natural-pdf 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl

natural-pdf 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl