natural-pdf 0.1.36__py3-none-any.whl → 0.1.37__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registry.
- natural_pdf/analyzers/guides.py +1053 -26
- natural_pdf/core/page.py +205 -45
- natural_pdf/core/pdf.py +16 -1
- natural_pdf/elements/collections.py +10 -0
- natural_pdf/elements/region.py +106 -14
- natural_pdf/elements/text.py +36 -2
- natural_pdf/flows/region.py +128 -26
- natural_pdf/selectors/parser.py +24 -0
- natural_pdf/utils/layout.py +26 -0
- natural_pdf/utils/text_extraction.py +76 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1221,7 +1221,7 @@ class Region(
         # Filter to elements in this region
         return [e for e in page_elements if self._is_element_in_region(e)]
 
-    def extract_text(self, apply_exclusions=True, debug=False, **kwargs) -> str:
+    def extract_text(self, apply_exclusions=True, debug=False, content_filter=None, **kwargs) -> str:
         """
         Extract text from this region, respecting page exclusions and using pdfplumber's
         layout engine (chars_to_textmap).
@@ -1229,6 +1229,10 @@ class Region(
         Args:
             apply_exclusions: Whether to apply exclusion regions defined on the parent page.
             debug: Enable verbose debugging output for filtering steps.
+            content_filter: Optional content filter to exclude specific text patterns. Can be:
+                - A regex pattern string (characters matching the pattern are EXCLUDED)
+                - A callable that takes text and returns True to KEEP the character
+                - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
             **kwargs: Additional layout parameters passed directly to pdfplumber's
                 `chars_to_textmap` function (e.g., layout, x_density, y_density).
                 See Page.extract_text docstring for more.
@@ -1285,10 +1289,15 @@ class Region(
         )
 
         # 5. Generate Text Layout using Utility
+        # Add content_filter to kwargs if provided
+        final_kwargs = kwargs.copy()
+        if content_filter is not None:
+            final_kwargs["content_filter"] = content_filter
+
         result = generate_text_layout(
             char_dicts=filtered_chars,
             layout_context_bbox=self.bbox,  # Use region's bbox for context
-            user_kwargs=
+            user_kwargs=final_kwargs,  # Pass kwargs including content_filter
         )
 
         logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
@@ -1304,6 +1313,7 @@ class Region(
         cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
         # --- NEW: Add tqdm control option --- #
         show_progress: bool = False,  # Controls progress bar for text method
+        content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,  # NEW: Content filtering
     ) -> TableResult:  # Return type allows Optional[str] for cells
         """
         Extract a table from this region.
@@ -1323,6 +1333,11 @@ class Region(
                 and returns its string content. Overrides default text extraction
                 for the 'text' method.
             show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
+            content_filter: Optional content filter to apply during cell text extraction. Can be:
+                - A regex pattern string (characters matching the pattern are EXCLUDED)
+                - A callable that takes text and returns True to KEEP the character
+                - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
+                Works with all extraction methods by filtering cell content.
 
         Returns:
             Table data as a list of rows, where each row is a list of cell values (str or None).
@@ -1358,7 +1373,7 @@ class Region(
             logger.debug(
                 f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
             )
-            return TableResult(self._extract_table_from_cells(cell_regions_in_table))
+            return TableResult(self._extract_table_from_cells(cell_regions_in_table, content_filter=content_filter))
 
         # --------------------------------------------------------------- #
 
@@ -1439,14 +1454,15 @@ class Region(
 
         # Use the selected method
         if effective_method == "tatr":
-            table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
+            table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter)
         elif effective_method == "text":
             current_text_options = text_options.copy()
             current_text_options["cell_extraction_func"] = cell_extraction_func
             current_text_options["show_progress"] = show_progress
+            current_text_options["content_filter"] = content_filter
             table_rows = self._extract_table_text(**current_text_options)
         elif effective_method == "pdfplumber":
-            table_rows = self._extract_table_plumber(table_settings)
+            table_rows = self._extract_table_plumber(table_settings, content_filter=content_filter)
         else:
             raise ValueError(
                 f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
@@ -1603,13 +1619,14 @@ class Region(
         # Return the tables or an empty list if none found
         return tables if tables else []
 
-    def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
+    def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
         """
         Extract table using pdfplumber's table extraction.
         This method extracts the largest table within the region.
 
         Args:
             table_settings: Settings for pdfplumber table extraction
+            content_filter: Optional content filter to apply to cell values
 
         Returns:
             Table data as a list of rows, where each row is a list of cell values
@@ -1645,16 +1662,31 @@ class Region(
 
         # Return the table or an empty list if none found
         if table:
+            # Apply content filtering if provided
+            if content_filter is not None:
+                filtered_table = []
+                for row in table:
+                    filtered_row = []
+                    for cell in row:
+                        if cell is not None:
+                            # Apply content filter to cell text
+                            filtered_cell = self._apply_content_filter_to_text(cell, content_filter)
+                            filtered_row.append(filtered_cell)
+                        else:
+                            filtered_row.append(cell)
+                    filtered_table.append(filtered_row)
+                return filtered_table
             return table
         return []
 
-    def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
+    def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
         """
         Extract table using TATR structure detection.
 
         Args:
             use_ocr: Whether to apply OCR to each cell for better text extraction
            ocr_config: Optional OCR configuration parameters
+            content_filter: Optional content filter to apply to cell values
 
         Returns:
             Table data as a list of rows, where each row is a list of cell values
@@ -1734,7 +1766,10 @@ class Region(
                     continue
 
                 # Fallback to normal extraction
-
+                header_text = header.extract_text().strip()
+                if content_filter is not None:
+                    header_text = self._apply_content_filter_to_text(header_text, content_filter)
+                header_texts.append(header_text)
             table_data.append(header_texts)
 
             # Process rows
@@ -1767,6 +1802,8 @@ class Region(
 
                         # Fallback to normal extraction
                         cell_text = cell_region.extract_text().strip()
+                        if content_filter is not None:
+                            cell_text = self._apply_content_filter_to_text(cell_text, content_filter)
                         row_cells.append(cell_text)
                 else:
                     # No column information, just extract the whole row text
@@ -1780,7 +1817,10 @@ class Region(
                         continue
 
                     # Fallback to normal extraction
-
+                    row_text = row.extract_text().strip()
+                    if content_filter is not None:
+                        row_text = self._apply_content_filter_to_text(row_text, content_filter)
+                    row_cells.append(row_text)
 
             table_data.append(row_cells)
 
@@ -1793,7 +1833,7 @@ class Region(
         Args:
             **text_options: Options passed to analyze_text_table_structure,
                 plus optional 'cell_extraction_func', 'coordinate_grouping_tolerance',
-                and '
+                'show_progress', and 'content_filter'.
 
         Returns:
             Table data as list of lists of strings (or None for empty cells).
@@ -1801,6 +1841,8 @@ class Region(
         cell_extraction_func = text_options.pop("cell_extraction_func", None)
         # --- Get show_progress option --- #
         show_progress = text_options.pop("show_progress", False)
+        # --- Get content_filter option --- #
+        content_filter = text_options.pop("content_filter", None)
 
         # Analyze structure first (or use cached results)
         if "text_table_structure" in self.analyses:
@@ -1881,7 +1923,7 @@ class Region(
                     cell_value = None
                 else:
                     cell_value = cell_region.extract_text(
-                        layout=False, apply_exclusions=False
+                        layout=False, apply_exclusions=False, content_filter=content_filter
                     ).strip()
 
                 rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
@@ -3356,12 +3398,16 @@ class Region(
     # New helper: build table from pre-computed table_cell regions
     # ------------------------------------------------------------------
 
-    def _extract_table_from_cells(self, cell_regions: List["Region"]) -> List[List[Optional[str]]]:
+    def _extract_table_from_cells(self, cell_regions: List["Region"], content_filter=None) -> List[List[Optional[str]]]:
         """Construct a table (list-of-lists) from table_cell regions.
 
         This assumes each cell Region has metadata.row_index / col_index as written by
         detect_table_structure_from_lines(). If these keys are missing we will
         fall back to sorting by geometry.
+
+        Args:
+            cell_regions: List of table cell Region objects to extract text from
+            content_filter: Optional content filter to apply to cell text extraction
         """
         if not cell_regions:
             return []
@@ -3392,7 +3438,7 @@ class Region(
             try:
                 r_idx = int(cell.metadata.get("row_index"))
                 c_idx = int(cell.metadata.get("col_index"))
-                text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
+                text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
                 table_grid[r_idx][c_idx] = text_val if text_val else None
             except Exception as _err:
                 # Skip problematic cell
@@ -3439,7 +3485,53 @@ class Region(
             row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
             col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
 
-            text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
+            text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
             table_grid[row_idx][col_idx] = text_val if text_val else None
 
         return table_grid
+
+    def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
+        """
+        Apply content filter to a text string.
+
+        Args:
+            text: Input text string
+            content_filter: Content filter (regex, callable, or list of regexes)
+
+        Returns:
+            Filtered text string
+        """
+        if not text or content_filter is None:
+            return text
+
+        import re
+
+        if isinstance(content_filter, str):
+            # Single regex pattern - remove matching parts
+            try:
+                return re.sub(content_filter, '', text)
+            except re.error:
+                return text  # Invalid regex, return original
+
+        elif isinstance(content_filter, list):
+            # List of regex patterns - remove parts matching ANY pattern
+            try:
+                result = text
+                for pattern in content_filter:
+                    result = re.sub(pattern, '', result)
+                return result
+            except re.error:
+                return text  # Invalid regex, return original
+
+        elif callable(content_filter):
+            # Callable filter - apply to individual characters
+            try:
+                filtered_chars = []
+                for char in text:
+                    if content_filter(char):
+                        filtered_chars.append(char)
+                return ''.join(filtered_chars)
+            except Exception:
+                return text  # Function error, return original
+
+        return text
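Taken together, the region.py changes let callers strip unwanted characters at extraction time rather than post-processing the returned strings. A minimal usage sketch; the file name, page index, and the `page.region(left, top, right, bottom)` call below are hypothetical illustrations, not part of this diff:

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")  # hypothetical input file
page = pdf.pages[0]
region = page.region(0, 0, page.width, page.height / 2)  # assumed region helper

# Regex string: characters matching the pattern are removed from the output.
no_boxes = region.extract_text(content_filter=r"[\u2500-\u257F]")

# Callable: invoked per character, return True to keep it.
digits_only = region.extract_text(content_filter=str.isdigit)

# List of patterns: anything matching ANY pattern is dropped, in tables too.
table = region.extract_table(content_filter=[r"\$", r","])
rows = list(table)  # TableResult is iterable over its rows
```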
natural_pdf/elements/text.py
CHANGED
@@ -230,7 +230,7 @@ class TextElement(Element):
         # Default to black
         return (0, 0, 0)
 
-    def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, **kwargs) -> str:
+    def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs) -> str:
         """
         Extract text from this element.
 
@@ -238,14 +238,48 @@ class TextElement(Element):
             keep_blank_chars: Retained for API compatibility (unused).
             strip: If True (default) remove leading/trailing whitespace. Users may
                 pass ``strip=False`` to preserve whitespace exactly as stored.
+            content_filter: Optional content filter to exclude specific text patterns. Can be:
+                - A regex pattern string (characters matching the pattern are EXCLUDED)
+                - A callable that takes text and returns True to KEEP the character
+                - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
             **kwargs: Accepted for forward-compatibility and ignored here.
 
         Returns:
-            The text content, optionally stripped.
+            The text content, optionally stripped and filtered.
         """
         # Basic retrieval
         result = self.text or ""
 
+        # Apply content filtering if provided
+        if content_filter is not None and result:
+            import re
+
+            if isinstance(content_filter, str):
+                # Single regex pattern - remove matching characters
+                try:
+                    result = re.sub(content_filter, '', result)
+                except re.error:
+                    pass  # Invalid regex, skip filtering
+
+            elif isinstance(content_filter, list):
+                # List of regex patterns - remove characters matching ANY pattern
+                try:
+                    for pattern in content_filter:
+                        result = re.sub(pattern, '', result)
+                except re.error:
+                    pass  # Invalid regex, skip filtering
+
+            elif callable(content_filter):
+                # Callable filter - apply to individual characters
+                try:
+                    filtered_chars = []
+                    for char in result:
+                        if content_filter(char):
+                            filtered_chars.append(char)
+                    result = ''.join(filtered_chars)
+                except Exception:
+                    pass  # Function error, skip filtering
+
         # Apply optional stripping – align with global convention where simple
         # element extraction is stripped by default.
         if strip:
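Note the asymmetry in the filtering logic added above: string and list filters are applied with `re.sub`, so a multi-character match is removed as a whole, while a callable is evaluated one character at a time and returns True to keep that character. A standalone sketch of the same rules, using plain Python with no natural_pdf objects involved:

```python
import re
from typing import Callable, List, Union

def apply_filter(text: str, content_filter: Union[str, List[str], Callable[[str], bool]]) -> str:
    """Mirror of the filtering rules used by TextElement.extract_text (illustrative only)."""
    if isinstance(content_filter, str):
        return re.sub(content_filter, "", text)               # drop regex matches
    if isinstance(content_filter, list):
        for pattern in content_filter:
            text = re.sub(pattern, "", text)                  # drop matches of ANY pattern
        return text
    if callable(content_filter):
        return "".join(c for c in text if content_filter(c))  # keep chars where True
    return text

print(apply_filter("Total: $1,234.56", r"[$,]"))      # -> "Total: 1234.56"
print(apply_filter("Total: $1,234.56", str.isdigit))  # -> "123456"
```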
natural_pdf/flows/region.py
CHANGED
@@ -1,11 +1,13 @@
 import logging
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
-from pdfplumber.utils.geometry import
+from pdfplumber.utils.geometry import merge_bboxes  # Import merge_bboxes directly
 
 # For runtime image manipulation
 from PIL import Image as PIL_Image_Runtime
 
+from natural_pdf.tables import TableResult
+
 if TYPE_CHECKING:
     from PIL.Image import Image as PIL_Image  # For type hints
 
@@ -53,28 +55,46 @@ class FlowRegion:
         self.source_flow_element: "FlowElement" = source_flow_element
         self.boundary_element_found: Optional["PhysicalElement"] = boundary_element_found
 
+        # Add attributes for grid building, similar to Region
+        self.source: Optional[str] = None
+        self.region_type: Optional[str] = None
+        self.metadata: Dict[str, Any] = {}
+
         # Cache for expensive operations
         self._cached_text: Optional[str] = None
         self._cached_elements: Optional["ElementCollection"] = None  # Stringized
         self._cached_bbox: Optional[Tuple[float, float, float, float]] = None
 
+    def __getattr__(self, name: str) -> Any:
+        """
+        Dynamically proxy attribute access to the source FlowElement if the
+        attribute is not found in this instance.
+        """
+        if name in self.__dict__:
+            return self.__dict__[name]
+        elif self.source_flow_element is not None:
+            return getattr(self.source_flow_element, name)
+        else:
+            raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+
     @property
     def bbox(self) -> Optional[Tuple[float, float, float, float]]:
         """
-
-
-        original physical coordinates.
-        Returns None if there are no constituent regions.
+        The bounding box that encloses all constituent regions.
+        Calculated dynamically and cached.
         """
         if self._cached_bbox is not None:
             return self._cached_bbox
         if not self.constituent_regions:
             return None
 
-        # Use
-        #
-
-
+        # Use merge_bboxes from pdfplumber.utils.geometry to merge bboxes
+        # Extract bbox tuples from regions first
+        region_bboxes = [region.bbox for region in self.constituent_regions if hasattr(region, "bbox")]
+        if not region_bboxes:
+            return None
+
+        self._cached_bbox = merge_bboxes(region_bboxes)
         return self._cached_bbox
 
     @property
@@ -200,22 +220,72 @@ class FlowRegion:
         self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
     ) -> Optional["PhysicalElement"]:  # Stringized
         """
-
+        Find the first element in flow order that matches the selector or text.
+
+        This implementation iterates through the constituent regions *in the order
+        they appear in ``self.constituent_regions`` (i.e. document flow order),
+        delegating the search to each region's own ``find`` method. It therefore
+        avoids constructing a huge intermediate ElementCollection and returns as
+        soon as a match is found, which is substantially faster and ensures that
+        selectors such as 'table' work exactly as they do on an individual
+        Region.
         """
-
-
-
+        if not self.constituent_regions:
+            return None
+
+        for region in self.constituent_regions:
+            try:
+                result = region.find(selector=selector, text=text, **kwargs)
+                if result is not None:
+                    return result
+            except Exception as e:
+                logger.warning(
+                    f"FlowRegion.find: error searching region {region}: {e}",
+                    exc_info=False,
+                )
+        return None  # No match found
 
     def find_all(
         self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
     ) -> "ElementCollection":  # Stringized
         """
-
+        Find **all** elements across the constituent regions that match the given
+        selector or text.
+
+        Rather than first materialising *every* element in the FlowRegion (which
+        can be extremely slow for multi-page flows), this implementation simply
+        chains each region's native ``find_all`` call and concatenates their
+        results into a single ElementCollection while preserving flow order.
         """
-
-
+        from natural_pdf.elements.collections import (
+            ElementCollection as RuntimeElementCollection,
+        )
+
+        matched_elements = []  # type: List["PhysicalElement"]
+
+        if not self.constituent_regions:
+            return RuntimeElementCollection([])
+
+        for region in self.constituent_regions:
+            try:
+                region_matches = region.find_all(
                     selector=selector, text=text, **kwargs
-
+                )
+                if region_matches:
+                    # ``region_matches`` is an ElementCollection – extend with its
+                    # underlying list so we don't create nested collections.
+                    matched_elements.extend(
+                        region_matches.elements
+                        if hasattr(region_matches, "elements")
+                        else list(region_matches)
+                    )
+            except Exception as e:
+                logger.warning(
+                    f"FlowRegion.find_all: error searching region {region}: {e}",
+                    exc_info=False,
+                )
+
+        return RuntimeElementCollection(matched_elements)
 
     def highlight(
         self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs
@@ -253,6 +323,7 @@ class FlowRegion:
         stack_direction: str = "vertical",
         stack_gap: int = 5,
         stack_background_color: Tuple[int, int, int] = (255, 255, 255),
+        crop: bool = False,
         **kwargs,
     ) -> Optional["PIL_Image"]:
         """
@@ -269,6 +340,7 @@ class FlowRegion:
             stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
             stack_gap: Gap in pixels between stacked pages.
             stack_background_color: RGB background color for the stacked image.
+            crop: If True, crop each rendered page to the bounding box of constituent regions on that page.
             **kwargs: Additional arguments passed to the underlying rendering methods.
 
         Returns:
@@ -358,6 +430,16 @@ class FlowRegion:
             if not temp_highlights_for_page:
                 continue
 
+            # Calculate crop bbox if cropping is enabled
+            crop_bbox = None
+            if crop and constituent_regions_on_this_page:
+                # Calculate the bounding box that encompasses all constituent regions on this page
+                min_x0 = min(region.bbox[0] for region in constituent_regions_on_this_page)
+                min_y0 = min(region.bbox[1] for region in constituent_regions_on_this_page)
+                max_x1 = max(region.bbox[2] for region in constituent_regions_on_this_page)
+                max_y1 = max(region.bbox[3] for region in constituent_regions_on_this_page)
+                crop_bbox = (min_x0, min_y0, max_x1, max_y1)
+
             page_image = highlighter_service.render_preview(
                 page_index=(
                     page_obj.index
@@ -369,6 +451,7 @@ class FlowRegion:
                 width=width,
                 labels=labels,  # Pass through labels
                 legend_position=legend_position,
+                crop_bbox=crop_bbox,
                 **kwargs,
             )
             if page_image:
@@ -549,7 +632,7 @@ class FlowRegion:
         cell_extraction_func: Optional[Callable[["PhysicalRegion"], Optional[str]]] = None,
         show_progress: bool = False,
         **kwargs,
-    ) ->
+    ) -> TableResult:
         """Extracts a single logical table from the FlowRegion.
 
         This is a convenience wrapper that iterates through the constituent
@@ -565,9 +648,9 @@ class FlowRegion:
        ``Region.extract_table`` implementation.
 
        Returns:
-            A
+            A TableResult object containing the aggregated table data. Rows returned from
            consecutive constituent regions are appended in document order. If
-            no tables are detected in any region, an empty
+            no tables are detected in any region, an empty TableResult is returned.
        """
 
        if table_settings is None:
@@ -576,13 +659,13 @@ class FlowRegion:
            text_options = {}
 
        if not self.constituent_regions:
-            return []
+            return TableResult([])
 
        aggregated_rows: List[List[Optional[str]]] = []
 
        for region in self.constituent_regions:
            try:
-
+                region_result = region.extract_table(
                    method=method,
                    table_settings=table_settings.copy(),  # Avoid side-effects
                    use_ocr=use_ocr,
@@ -593,16 +676,16 @@ class FlowRegion:
                    **kwargs,
                )
 
-                #
-                if
-                    aggregated_rows.extend(
+                # region_result is now a TableResult object, extract the rows
+                if region_result:
+                    aggregated_rows.extend(region_result)
            except Exception as e:
                logger.error(
                    f"FlowRegion.extract_table: Error extracting table from constituent region {region}: {e}",
                    exc_info=True,
                )
 
-        return aggregated_rows
+        return TableResult(aggregated_rows)
 
     def extract_tables(
         self,
@@ -649,3 +732,22 @@ class FlowRegion:
             )
 
         return all_tables
+
+    @property
+    def normalized_type(self) -> Optional[str]:
+        """
+        Return the normalized type for selector compatibility.
+        This allows FlowRegion to be found by selectors like 'table'.
+        """
+        if self.region_type:
+            # Convert region_type to normalized format (replace spaces with underscores, lowercase)
+            return self.region_type.lower().replace(" ", "_")
+        return None
+
+    @property
+    def type(self) -> Optional[str]:
+        """
+        Return the type attribute for selector compatibility.
+        This is an alias for normalized_type.
+        """
+        return self.normalized_type
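Because FlowRegion.extract_table now returns a TableResult rather than a bare list, and find/find_all delegate to each constituent region in flow order, a multi-page flow can be queried much like a single Region. A hedged sketch: it assumes a `flow_region` produced elsewhere by the Flow API (setup omitted) and relies only on TableResult being iterable over its rows, as the aggregation code above does:

```python
# flow_region: a FlowRegion obtained from the Flow API (hypothetical setup omitted)
result = flow_region.extract_table()   # TableResult; empty if nothing was detected
for row in result:                     # iterate rows aggregated across regions
    print(row)

# Selectors now behave as they do on an individual Region:
first_table = flow_region.find("table")
big_text = flow_region.find_all("text[size>12]")
```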
natural_pdf/selectors/parser.py
CHANGED
@@ -24,6 +24,7 @@ This enables powerful document navigation like:
 - page.find('text[size>12]:bold:contains("Summary")')
 - page.find_all('rect[color~="red"]:above(text:contains("Total"))')
 - page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
+- page.find('text:regex("[\u2500-\u257F]")')  # Box drawing characters
 """
 
 import ast
@@ -748,6 +749,29 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
 
         filter_lambda = contains_check
 
+    # --- Handle :regex pseudo-class (same as :contains with regex=True) ---
+    elif name == "regex" and args is not None:
+        ignore_case = not kwargs.get("case", True)  # Default case sensitive
+        filter_name = f"pseudo-class :regex({args!r}, ignore_case={ignore_case})"
+
+        def regex_check(element, args=args, ignore_case=ignore_case):
+            if not hasattr(element, "text") or not element.text:
+                return False  # Element must have non-empty text
+
+            element_text = element.text
+            search_term = str(args)  # Ensure args is string
+
+            try:
+                pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
+                return bool(pattern.search(element_text))
+            except re.error as e:
+                logger.warning(
+                    f"Invalid regex '{search_term}' in :regex selector: {e}. Returning False."
+                )
+                return False
+
+        filter_lambda = regex_check
+
     # --- Handle :startswith and :starts-with (alias) --- #
     elif name in ("starts-with", "startswith") and args is not None:
         filter_name = f"pseudo-class :{name}({args!r})"
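The new `:regex()` pseudo-class mirrors `:contains(..., regex=True)`: the argument is compiled with `re.compile` and tested against each element's text with `pattern.search`, returning no match if the pattern is invalid. A usage sketch; the `page` object and the `case=False` keyword pass-through are assumptions based on how `:contains` already works in this parser:

```python
# Elements whose text contains an ISO-style date
dates = page.find_all('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')

# Elements containing box-drawing characters (e.g. borders rendered as text)
borders = page.find_all('text:regex("[\u2500-\u257F]")')

# Case-insensitive matching via the existing `case` keyword (assumed forwarded)
totals = page.find_all('text:regex("total")', case=False)
```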
natural_pdf/utils/layout.py
ADDED
@@ -0,0 +1,26 @@
+from typing import List, Optional, Tuple
+
+
+def merge_bboxes(
+    bboxes: List[Optional[Tuple[float, float, float, float]]]
+) -> Optional[Tuple[float, float, float, float]]:
+    """
+    Merge multiple bounding boxes into a single one that encompasses all of them.
+
+    Args:
+        bboxes: A list of bbox tuples (x0, top, x1, bottom). Can contain None values.
+
+    Returns:
+        A single merged bbox tuple, or None if no valid bboxes are provided.
+    """
+    if not bboxes:
+        return None
+
+    # Filter out None or invalid bboxes
+    valid_bboxes = [b for b in bboxes if b and len(b) == 4]
+    if not valid_bboxes:
+        return None
+
+    x0s, tops, x1s, bottoms = zip(*valid_bboxes)
+
+    return (min(x0s), min(tops), max(x1s), max(bottoms))