PyPI - natural-pdf - Versions diffs - 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl - Mend

natural-pdf 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (24)

natural_pdf/__init__.py +45 -0
natural_pdf/analyzers/guides.py +359 -0
natural_pdf/core/element_manager.py +4 -0
natural_pdf/core/page.py +130 -31
natural_pdf/core/page_collection.py +75 -0
natural_pdf/core/pdf.py +33 -0
natural_pdf/describe/base.py +48 -7
natural_pdf/elements/base.py +408 -43
natural_pdf/elements/element_collection.py +83 -10
natural_pdf/elements/region.py +217 -178
natural_pdf/elements/text.py +5 -3
natural_pdf/flows/element.py +1 -0
natural_pdf/flows/flow.py +175 -480
natural_pdf/flows/region.py +76 -0
natural_pdf/selectors/parser.py +180 -9
natural_pdf/utils/pdfminer_patches.py +136 -0
natural_pdf/utils/sections.py +346 -0
natural_pdf/utils/spatial.py +172 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/METADATA +1 -1
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/RECORD +24 -21
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/WHEEL +0 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/top_level.txt +0 -0

natural_pdf/core/page.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import base64
 import concurrent.futures  # Added import
+import contextlib
 import hashlib
 import io
 import json
@@ -30,6 +31,7 @@ from tqdm.auto import tqdm  # Added tqdm import
 from natural_pdf.elements.element_collection import ElementCollection
 from natural_pdf.elements.region import Region
 from natural_pdf.selectors.parser import parse_selector
+from natural_pdf.tables.result import TableResult
 from natural_pdf.utils.locks import pdf_render_lock  # Import from utils instead
 from natural_pdf.utils.visualization import render_plain_page
@@ -274,6 +276,9 @@ class Page(
         self._load_elements()
         self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
+        # Flag to prevent infinite recursion when computing exclusions
+        self._computing_exclusions = False
     def _get_render_specs(
         self,
         mode: Literal["show", "render"] = "show",
@@ -411,6 +416,35 @@ class Page(
         self._exclusions = []
         return self
+    @contextlib.contextmanager
+    def without_exclusions(self):
+        """
+        Context manager that temporarily disables exclusion processing.
+        This prevents infinite recursion when exclusion callables themselves
+        use find() operations. While in this context, all find operations
+        will skip exclusion filtering.
+        Example:
+            ```python
+            # This exclusion would normally cause infinite recursion:
+            page.add_exclusion(lambda p: p.find("text:contains('Header')").expand())
+            # But internally, it's safe because we use:
+            with page.without_exclusions():
+                region = exclusion_callable(page)
+            ```
+        Yields:
+            The page object with exclusions temporarily disabled.
+        """
+        old_value = self._computing_exclusions
+        self._computing_exclusions = True
+        try:
+            yield self
+        finally:
+            self._computing_exclusions = old_value
     def add_exclusion(
         self,
         exclusion_func_or_region: Union[
@@ -758,15 +792,10 @@ class Page(
                     if debug:
                         print(f"  - Evaluating callable '{exclusion_label}'...")
-                    # Temporarily clear exclusions (consider if really needed)
-                    temp_original_exclusions = self._exclusions
-                    self._exclusions = []
-                    # Call the function - Expects it to return a Region or None
-                    region_result = exclusion_item(self)
-                    # Restore exclusions
-                    self._exclusions = temp_original_exclusions
+                    # Use context manager to prevent infinite recursion
+                    with self.without_exclusions():
+                        # Call the function - Expects it to return a Region or None
+                        region_result = exclusion_item(self)
                     if isinstance(region_result, Region):
                         # Assign the label to the returned region
@@ -866,26 +895,33 @@ class Page(
                 if debug:
                     print(f"  - Added direct region '{label}': {exclusion_item}")
-            # Process direct Element objects - convert to Region
+            # Process direct Element objects - only convert to Region if method is "region"
             elif hasattr(exclusion_item, "bbox") and hasattr(exclusion_item, "expand"):
-                try:
-                    # Convert Element to Region using expand()
-                    expanded_region = exclusion_item.expand()
-                    if isinstance(expanded_region, Region):
-                        expanded_region.label = label
-                        regions.append(expanded_region)
-                        if debug:
-                            print(
-                                f"  - Converted direct Element to Region '{label}': {expanded_region}"
-                            )
-                    else:
+                if method == "region":
+                    try:
+                        # Convert Element to Region using expand()
+                        expanded_region = exclusion_item.expand()
+                        if isinstance(expanded_region, Region):
+                            expanded_region.label = label
+                            regions.append(expanded_region)
+                            if debug:
+                                print(
+                                    f"  - Converted direct Element to Region '{label}': {expanded_region}"
+                                )
+                        else:
+                            if debug:
+                                print(
+                                    f"  - Element.expand() did not return a Region: {type(expanded_region)}"
+                                )
+                    except Exception as e:
                         if debug:
-                            print(
-                                f"  - Element.expand() did not return a Region: {type(expanded_region)}"
-                            )
-                except Exception as e:
+                            print(f"  - Failed to convert Element to Region: {e}")
+                else:
+                    # method == "element" - will be handled in _filter_elements_by_exclusions
                     if debug:
-                        print(f"  - Failed to convert Element to Region: {e}")
+                        print(
+                            f"  - Skipping element '{label}' (will be handled as element-based exclusion)"
+                        )
             # Process string selectors (from PDF-level exclusions)
             elif isinstance(exclusion_item, str):
@@ -939,6 +975,11 @@ class Page(
         Returns:
             A new list containing only the elements not excluded.
         """
+        # Skip exclusion filtering if we're currently computing exclusions
+        # This prevents infinite recursion when exclusion callables use find operations
+        if self._computing_exclusions:
+            return elements
         # Check both page-level and PDF-level exclusions
         has_page_exclusions = bool(self._exclusions)
         has_pdf_exclusions = (
@@ -1245,15 +1286,46 @@ class Page(
         Returns:
             ElementCollection of matching elements (unfiltered by exclusions)
         """
-        from natural_pdf.selectors.parser import selector_to_filter_func
+        from natural_pdf.selectors.parser import _calculate_aggregates, selector_to_filter_func
         # Handle compound OR selectors
         if selector_obj.get("type") == "or":
             # For OR selectors, search all elements and let the filter function decide
             elements_to_search = self._element_mgr.get_all_elements()
+            # Check if any sub-selector contains aggregate functions
+            has_aggregates = False
+            for sub_selector in selector_obj.get("selectors", []):
+                for attr in sub_selector.get("attributes", []):
+                    value = attr.get("value")
+                    if isinstance(value, dict) and value.get("type") == "aggregate":
+                        has_aggregates = True
+                        break
+                if has_aggregates:
+                    break
+            # Calculate aggregates if needed - for OR selectors we calculate on ALL elements
+            aggregates = {}
+            if has_aggregates:
+                # Need to calculate aggregates for each sub-selector type
+                for sub_selector in selector_obj.get("selectors", []):
+                    sub_type = sub_selector.get("type", "any").lower()
+                    if sub_type == "text":
+                        sub_elements = self._element_mgr.words
+                    elif sub_type == "rect":
+                        sub_elements = self._element_mgr.rects
+                    elif sub_type == "line":
+                        sub_elements = self._element_mgr.lines
+                    elif sub_type == "region":
+                        sub_elements = self._element_mgr.regions
+                    else:
+                        sub_elements = elements_to_search
+                    sub_aggregates = _calculate_aggregates(sub_elements, sub_selector)
+                    aggregates.update(sub_aggregates)
             # Create filter function from compound selector
-            filter_func = selector_to_filter_func(selector_obj, **kwargs)
+            filter_func = selector_to_filter_func(selector_obj, aggregates=aggregates, **kwargs)
             # Apply the filter to all elements
             matching_elements = [element for element in elements_to_search if filter_func(element)]
@@ -1309,8 +1381,23 @@ class Page(
         else:
             elements_to_search = self._element_mgr.get_all_elements()
+        # Check if selector contains aggregate functions
+        has_aggregates = False
+        for attr in selector_obj.get("attributes", []):
+            value = attr.get("value")
+            if isinstance(value, dict) and value.get("type") == "aggregate":
+                has_aggregates = True
+                break
+        # Calculate aggregates if needed
+        aggregates = {}
+        if has_aggregates:
+            # For aggregates, we need to calculate based on ALL elements of the same type
+            # not just the filtered subset
+            aggregates = _calculate_aggregates(elements_to_search, selector_obj)
         # Create filter function from selector, passing any additional parameters
-        filter_func = selector_to_filter_func(selector_obj, **kwargs)
+        filter_func = selector_to_filter_func(selector_obj, aggregates=aggregates, **kwargs)
         # Apply the filter to matching elements
         matching_elements = [element for element in elements_to_search if filter_func(element)]
@@ -1857,7 +1944,9 @@ class Page(
         cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
         show_progress: bool = False,
         content_filter=None,
-    ) -> List[List[Optional[str]]]:
+        verticals: Optional[List[float]] = None,
+        horizontals: Optional[List[float]] = None,
+    ) -> TableResult:
         """
         Extract the largest table from this page using enhanced region-based extraction.
@@ -1874,9 +1963,11 @@ class Page(
                 - A regex pattern string (characters matching the pattern are EXCLUDED)
                 - A callable that takes text and returns True to KEEP the character
                 - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
+            verticals: Optional list of x-coordinates for explicit vertical table lines.
+            horizontals: Optional list of y-coordinates for explicit horizontal table lines.
         Returns:
-            Table data as a list of rows, where each row is a list of cell values (str or None).
+            TableResult: A sequence-like object containing table rows that also provides .to_df() for pandas conversion.
         """
         # Create a full-page region and delegate to its enhanced extract_table method
         page_region = self.create_region(0, 0, self.width, self.height)
@@ -1889,6 +1980,8 @@ class Page(
             cell_extraction_func=cell_extraction_func,
             show_progress=show_progress,
             content_filter=content_filter,
+            verticals=verticals,
+            horizontals=horizontals,
         )
     def extract_tables(
@@ -2768,6 +2861,7 @@ class Page(
                             region.start_element = current_start_element
                             region.end_element = end_boundary_el  # Mark the element that ended it
                             region.is_end_next_start = True  # Mark how it ended
+                            region._boundary_exclusions = include_boundaries
                             regions.append(region)
                     else:  # horizontal
                         sec_left = (
@@ -2787,6 +2881,7 @@ class Page(
                             region.start_element = current_start_element
                             region.end_element = end_boundary_el  # Mark the element that ended it
                             region.is_end_next_start = True  # Mark how it ended
+                            region._boundary_exclusions = include_boundaries
                             regions.append(region)
                     active_section_started = False  # Reset for the new start
@@ -2815,6 +2910,7 @@ class Page(
                         region.start_element = current_start_element
                         region.end_element = end_boundary_el
                         region.is_end_next_start = False
+                        region._boundary_exclusions = include_boundaries
                         regions.append(region)
                 else:  # horizontal
                     sec_left = (
@@ -2834,6 +2930,7 @@ class Page(
                         region.start_element = current_start_element
                         region.end_element = end_boundary_el
                         region.is_end_next_start = False
+                        region._boundary_exclusions = include_boundaries
                         regions.append(region)
                 # Reset: section ended explicitly
@@ -2854,6 +2951,7 @@ class Page(
                     region.start_element = current_start_element
                     region.end_element = None  # Ended by page end
                     region.is_end_next_start = False
+                    region._boundary_exclusions = include_boundaries
                     regions.append(region)
             else:  # horizontal
                 sec_left = (
@@ -2867,6 +2965,7 @@ class Page(
                     region.start_element = current_start_element
                     region.end_element = None  # Ended by page end
                     region.is_end_next_start = False
+                    region._boundary_exclusions = include_boundaries
                     regions.append(region)
         return ElementCollection(regions)

natural_pdf/core/page_collection.py CHANGED Viewed

@@ -789,6 +789,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
                                 start_element.page,
                                 (0, top, start_element.page.width, bottom),
                             )
+                            section._boundary_exclusions = include_boundaries
                         else:  # horizontal
                             left = start_element.x0
                             right = end_element.x1
@@ -821,6 +822,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
                                 start_element.page,
                                 (left, 0, right, start_element.page.height),
                             )
+                            section._boundary_exclusions = include_boundaries
                         section.start_element = start_element
                         section.boundary_element_found = end_element
                     else:
@@ -865,6 +867,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
                                 start_element.page, (0, top, start_element.page.width, bottom)
                             )
                             section.start_element = start_element
+                            section.end_element = (
+                                next_start  # The next start is the end of this section
+                            )
+                            section._boundary_exclusions = include_boundaries
                             sections.append(section)
                     else:  # horizontal
                         # Determine horizontal bounds
@@ -882,6 +888,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
                                 start_element.page, (left, 0, right, start_element.page.height)
                             )
                             section.start_element = start_element
+                            section.end_element = (
+                                next_start  # The next start is the end of this section
+                            )
+                            section._boundary_exclusions = include_boundaries
                             sections.append(section)
                 else:
                     # Cross-page section - create from current_start to the end of its page
@@ -982,6 +992,71 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         return ElementCollection(sections)
+    def split(self, divider, **kwargs) -> "ElementCollection[Region]":
+        """
+        Divide this page collection into sections based on the provided divider elements.
+        Args:
+            divider: Elements or selector string that mark section boundaries
+            **kwargs: Additional parameters passed to get_sections()
+                - include_boundaries: How to include boundary elements (default: 'start')
+                - orientation: 'vertical' or 'horizontal' (default: 'vertical')
+                - new_section_on_page_break: Whether to split at page boundaries (default: False)
+        Returns:
+            ElementCollection of Region objects representing the sections
+        Example:
+            # Split a PDF by chapter titles
+            chapters = pdf.pages.split("text[size>20]:contains('CHAPTER')")
+            # Split by page breaks
+            page_sections = pdf.pages.split(None, new_section_on_page_break=True)
+            # Split multi-page document by section headers
+            sections = pdf.pages[10:20].split("text:bold:contains('Section')")
+        """
+        # Default to 'start' boundaries for split (include divider at start of each section)
+        if "include_boundaries" not in kwargs:
+            kwargs["include_boundaries"] = "start"
+        sections = self.get_sections(start_elements=divider, **kwargs)
+        # Add initial section if there's content before the first divider
+        if sections and divider is not None:
+            # Get all elements across all pages
+            all_elements = []
+            for page in self.pages:
+                all_elements.extend(page.get_elements())
+            if all_elements:
+                # Find first divider
+                if isinstance(divider, str):
+                    # Search for first matching element
+                    first_divider = None
+                    for page in self.pages:
+                        match = page.find(divider)
+                        if match:
+                            first_divider = match
+                            break
+                else:
+                    # divider is already elements
+                    first_divider = divider[0] if hasattr(divider, "__getitem__") else divider
+                if first_divider and all_elements[0] != first_divider:
+                    # There's content before the first divider
+                    # Get section from start to first divider
+                    initial_sections = self.get_sections(
+                        start_elements=None,
+                        end_elements=[first_divider],
+                        include_boundaries="none",
+                        orientation=kwargs.get("orientation", "vertical"),
+                    )
+                    if initial_sections:
+                        sections = ElementCollection([initial_sections[0]] + list(sections))
+        return sections
     def _gather_analysis_data(
         self,
         analysis_keys: List[str],

natural_pdf/core/pdf.py CHANGED Viewed

@@ -1333,6 +1333,39 @@ class PDF(
             orientation=orientation,
         )
+    def split(self, divider, **kwargs) -> "ElementCollection":
+        """
+        Divide the PDF into sections based on the provided divider elements.
+        Args:
+            divider: Elements or selector string that mark section boundaries
+            **kwargs: Additional parameters passed to get_sections()
+                - include_boundaries: How to include boundary elements (default: 'start')
+                - orientation: 'vertical' or 'horizontal' (default: 'vertical')
+                - new_section_on_page_break: Whether to split at page boundaries (default: False)
+        Returns:
+            ElementCollection of Region objects representing the sections
+        Example:
+            # Split a PDF by chapter titles
+            chapters = pdf.split("text[size>20]:contains('Chapter')")
+            # Export each chapter to a separate file
+            for i, chapter in enumerate(chapters):
+                chapter_text = chapter.extract_text()
+                with open(f"chapter_{i+1}.txt", "w") as f:
+                    f.write(chapter_text)
+            # Split by horizontal rules/lines
+            sections = pdf.split("line[orientation=horizontal]")
+            # Split only by page breaks (no divider elements)
+            pages = pdf.split(None, new_section_on_page_break=True)
+        """
+        # Delegate to pages collection
+        return self.pages.split(divider, **kwargs)
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
         """
         DEPRECATED: Use save_pdf(..., ocr=True) instead.

natural_pdf/describe/base.py CHANGED Viewed

@@ -272,17 +272,12 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
                 "font_family",
                 "font_variant",
                 "size",
-                "bold",
-                "italic",
-                "strike",
-                "underline",
-                "highlight",
+                "styles",
                 "source",
                 "confidence",
+                "color",
             ]
         )
-        # Add foreground text colour too
-        columns.append("color")
     elif element_type == "rect":
         columns = base_columns + ["width", "height", "stroke", "fill", "stroke_width"]
     elif element_type == "line":
@@ -358,6 +353,52 @@ def _extract_element_value(element: "Element", column: str) -> Any:
                 return str(col_val)
             return ""
+        elif column == "styles":
+            # Collect all active text decorations
+            styles = []
+            if getattr(element, "bold", False):
+                styles.append("bold")
+            if getattr(element, "italic", False):
+                styles.append("italic")
+            if getattr(element, "strike", False):
+                styles.append("strike")
+            if getattr(element, "underline", False):
+                styles.append("underline")
+            # Handle highlight specially - include color if not default yellow
+            if getattr(element, "is_highlighted", False):
+                highlight_color = getattr(element, "highlight_color", None)
+                if highlight_color is not None:
+                    # Convert color to hex if needed
+                    if isinstance(highlight_color, (tuple, list)) and len(highlight_color) >= 3:
+                        try:
+                            r, g, b = [
+                                int(v * 255) if v <= 1 else int(v) for v in highlight_color[:3]
+                            ]
+                            hex_color = f"#{r:02x}{g:02x}{b:02x}"
+                            styles.append(f"highlight({hex_color})")
+                        except Exception:
+                            styles.append("highlight")
+                    elif isinstance(highlight_color, (int, float)):
+                        # Grayscale value
+                        try:
+                            gray = (
+                                int(highlight_color * 255)
+                                if highlight_color <= 1
+                                else int(highlight_color)
+                            )
+                            hex_color = f"#{gray:02x}{gray:02x}{gray:02x}"
+                            styles.append(f"highlight({hex_color})")
+                        except Exception:
+                            styles.append("highlight")
+                    else:
+                        styles.append("highlight")
+                else:
+                    styles.append("highlight")
+            return ", ".join(styles) if styles else ""
         elif column in ["stroke", "fill", "color"]:
             value = getattr(element, column, None)
             # If already a string (e.g. '#ff00aa' or 'red') return as is

natural-pdf 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl

natural-pdf 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl