PyPI - natural-pdf - Versions diffs - 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl - Mend

natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

natural_pdf/analyzers/guides.py +1053 -26
natural_pdf/core/page.py +274 -46
natural_pdf/core/pdf.py +116 -30
natural_pdf/elements/collections.py +48 -7
natural_pdf/elements/region.py +179 -17
natural_pdf/elements/text.py +36 -2
natural_pdf/flows/region.py +128 -26
natural_pdf/selectors/parser.py +24 -0
natural_pdf/utils/layout.py +26 -0
natural_pdf/utils/text_extraction.py +76 -1
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/METADATA +1 -1
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/RECORD +16 -15
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/top_level.txt +0 -0

natural_pdf/elements/region.py CHANGED Viewed

@@ -1221,7 +1221,7 @@ class Region(
             # Filter to elements in this region
             return [e for e in page_elements if self._is_element_in_region(e)]
-    def extract_text(self, apply_exclusions=True, debug=False, **kwargs) -> str:
+    def extract_text(self, apply_exclusions=True, debug=False, content_filter=None, **kwargs) -> str:
         """
         Extract text from this region, respecting page exclusions and using pdfplumber's
         layout engine (chars_to_textmap).
@@ -1229,6 +1229,10 @@ class Region(
         Args:
             apply_exclusions: Whether to apply exclusion regions defined on the parent page.
             debug: Enable verbose debugging output for filtering steps.
+            content_filter: Optional content filter to exclude specific text patterns. Can be:
+                - A regex pattern string (characters matching the pattern are EXCLUDED)
+                - A callable that takes text and returns True to KEEP the character
+                - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
             **kwargs: Additional layout parameters passed directly to pdfplumber's
                       `chars_to_textmap` function (e.g., layout, x_density, y_density).
                       See Page.extract_text docstring for more.
@@ -1285,10 +1289,15 @@ class Region(
         )
         # 5. Generate Text Layout using Utility
+        # Add content_filter to kwargs if provided
+        final_kwargs = kwargs.copy()
+        if content_filter is not None:
+            final_kwargs["content_filter"] = content_filter
         result = generate_text_layout(
             char_dicts=filtered_chars,
             layout_context_bbox=self.bbox,  # Use region's bbox for context
-            user_kwargs=kwargs,  # Pass original kwargs to layout generator
+            user_kwargs=final_kwargs,  # Pass kwargs including content_filter
         )
         logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
@@ -1304,6 +1313,7 @@ class Region(
         cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
         # --- NEW: Add tqdm control option --- #
         show_progress: bool = False,  # Controls progress bar for text method
+        content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,  # NEW: Content filtering
     ) -> TableResult:  # Return type allows Optional[str] for cells
         """
         Extract a table from this region.
@@ -1323,6 +1333,11 @@ class Region(
                                   and returns its string content. Overrides default text extraction
                                   for the 'text' method.
             show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
+            content_filter: Optional content filter to apply during cell text extraction. Can be:
+                - A regex pattern string (characters matching the pattern are EXCLUDED)
+                - A callable that takes text and returns True to KEEP the character
+                - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
+                Works with all extraction methods by filtering cell content.
         Returns:
             Table data as a list of rows, where each row is a list of cell values (str or None).
@@ -1358,7 +1373,7 @@ class Region(
                     logger.debug(
                         f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
                     )
-                    return TableResult(self._extract_table_from_cells(cell_regions_in_table))
+                    return TableResult(self._extract_table_from_cells(cell_regions_in_table, content_filter=content_filter))
                 # --------------------------------------------------------------- #
@@ -1439,14 +1454,15 @@ class Region(
         # Use the selected method
         if effective_method == "tatr":
-            table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
+            table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter)
         elif effective_method == "text":
             current_text_options = text_options.copy()
             current_text_options["cell_extraction_func"] = cell_extraction_func
             current_text_options["show_progress"] = show_progress
+            current_text_options["content_filter"] = content_filter
             table_rows = self._extract_table_text(**current_text_options)
         elif effective_method == "pdfplumber":
-            table_rows = self._extract_table_plumber(table_settings)
+            table_rows = self._extract_table_plumber(table_settings, content_filter=content_filter)
         else:
             raise ValueError(
                 f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
@@ -1600,16 +1616,35 @@ class Region(
         # Extract all tables from the cropped area
         tables = cropped.extract_tables(table_settings)
-        # Return the tables or an empty list if none found
-        return tables if tables else []
+        # Apply RTL text processing to all tables
+        if tables:
+            processed_tables = []
+            for table in tables:
+                processed_table = []
+                for row in table:
+                    processed_row = []
+                    for cell in row:
+                        if cell is not None:
+                            # Apply RTL text processing to each cell
+                            rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
+                            processed_row.append(rtl_processed_cell)
+                        else:
+                            processed_row.append(cell)
+                    processed_table.append(processed_row)
+                processed_tables.append(processed_table)
+            return processed_tables
+        # Return empty list if no tables found
+        return []
-    def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
+    def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
         """
         Extract table using pdfplumber's table extraction.
         This method extracts the largest table within the region.
         Args:
             table_settings: Settings for pdfplumber table extraction
+            content_filter: Optional content filter to apply to cell values
         Returns:
             Table data as a list of rows, where each row is a list of cell values
@@ -1645,16 +1680,35 @@ class Region(
         # Return the table or an empty list if none found
         if table:
-            return table
+            # Apply RTL text processing and content filtering if provided
+            processed_table = []
+            for row in table:
+                processed_row = []
+                for cell in row:
+                    if cell is not None:
+                        # Apply RTL text processing first
+                        rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
+                        # Then apply content filter if provided
+                        if content_filter is not None:
+                            filtered_cell = self._apply_content_filter_to_text(rtl_processed_cell, content_filter)
+                            processed_row.append(filtered_cell)
+                        else:
+                            processed_row.append(rtl_processed_cell)
+                    else:
+                        processed_row.append(cell)
+                processed_table.append(processed_row)
+            return processed_table
         return []
-    def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
+    def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
         """
         Extract table using TATR structure detection.
         Args:
             use_ocr: Whether to apply OCR to each cell for better text extraction
             ocr_config: Optional OCR configuration parameters
+            content_filter: Optional content filter to apply to cell values
         Returns:
             Table data as a list of rows, where each row is a list of cell values
@@ -1734,7 +1788,10 @@ class Region(
                             continue
                 # Fallback to normal extraction
-                header_texts.append(header.extract_text().strip())
+                header_text = header.extract_text().strip()
+                if content_filter is not None:
+                    header_text = self._apply_content_filter_to_text(header_text, content_filter)
+                header_texts.append(header_text)
             table_data.append(header_texts)
         # Process rows
@@ -1767,6 +1824,8 @@ class Region(
                     # Fallback to normal extraction
                     cell_text = cell_region.extract_text().strip()
+                    if content_filter is not None:
+                        cell_text = self._apply_content_filter_to_text(cell_text, content_filter)
                     row_cells.append(cell_text)
             else:
                 # No column information, just extract the whole row text
@@ -1780,7 +1839,10 @@ class Region(
                             continue
                 # Fallback to normal extraction
-                row_cells.append(row.extract_text().strip())
+                row_text = row.extract_text().strip()
+                if content_filter is not None:
+                    row_text = self._apply_content_filter_to_text(row_text, content_filter)
+                row_cells.append(row_text)
             table_data.append(row_cells)
@@ -1793,7 +1855,7 @@ class Region(
         Args:
             **text_options: Options passed to analyze_text_table_structure,
                           plus optional 'cell_extraction_func', 'coordinate_grouping_tolerance',
-                          and 'show_progress'.
+                          'show_progress', and 'content_filter'.
         Returns:
             Table data as list of lists of strings (or None for empty cells).
@@ -1801,6 +1863,8 @@ class Region(
         cell_extraction_func = text_options.pop("cell_extraction_func", None)
         # --- Get show_progress option --- #
         show_progress = text_options.pop("show_progress", False)
+        # --- Get content_filter option --- #
+        content_filter = text_options.pop("content_filter", None)
         # Analyze structure first (or use cached results)
         if "text_table_structure" in self.analyses:
@@ -1881,7 +1945,7 @@ class Region(
                         cell_value = None
                 else:
                     cell_value = cell_region.extract_text(
-                        layout=False, apply_exclusions=False
+                        layout=False, apply_exclusions=False, content_filter=content_filter
                     ).strip()
                 rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
@@ -3356,12 +3420,16 @@ class Region(
     # New helper: build table from pre-computed table_cell regions
     # ------------------------------------------------------------------
-    def _extract_table_from_cells(self, cell_regions: List["Region"]) -> List[List[Optional[str]]]:
+    def _extract_table_from_cells(self, cell_regions: List["Region"], content_filter=None) -> List[List[Optional[str]]]:
         """Construct a table (list-of-lists) from table_cell regions.
         This assumes each cell Region has metadata.row_index / col_index as written by
         detect_table_structure_from_lines().  If these keys are missing we will
         fall back to sorting by geometry.
+        Args:
+            cell_regions: List of table cell Region objects to extract text from
+            content_filter: Optional content filter to apply to cell text extraction
         """
         if not cell_regions:
             return []
@@ -3392,7 +3460,7 @@ class Region(
                 try:
                     r_idx = int(cell.metadata.get("row_index"))
                     c_idx = int(cell.metadata.get("col_index"))
-                    text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
+                    text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
                     table_grid[r_idx][c_idx] = text_val if text_val else None
                 except Exception as _err:
                     # Skip problematic cell
@@ -3439,7 +3507,101 @@ class Region(
             row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
             col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
-            text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
+            text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
             table_grid[row_idx][col_idx] = text_val if text_val else None
         return table_grid
+    def _apply_rtl_processing_to_text(self, text: str) -> str:
+        """
+        Apply RTL (Right-to-Left) text processing to a string.
+        This converts visual order text (as stored in PDFs) to logical order
+        for proper display of Arabic, Hebrew, and other RTL scripts.
+        Args:
+            text: Input text string in visual order
+        Returns:
+            Text string in logical order
+        """
+        if not text or not text.strip():
+            return text
+        # Quick check for RTL characters - if none found, return as-is
+        import unicodedata
+        def _contains_rtl(s):
+            return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
+        if not _contains_rtl(text):
+            return text
+        try:
+            from bidi.algorithm import get_display  # type: ignore
+            from natural_pdf.utils.bidi_mirror import mirror_brackets
+            # Apply BiDi algorithm to convert from visual to logical order
+            # Process line by line to handle mixed content properly
+            processed_lines = []
+            for line in text.split("\n"):
+                if line.strip():
+                    # Determine base direction for this line
+                    base_dir = "R" if _contains_rtl(line) else "L"
+                    logical_line = get_display(line, base_dir=base_dir)
+                    # Apply bracket mirroring for correct logical order
+                    processed_lines.append(mirror_brackets(logical_line))
+                else:
+                    processed_lines.append(line)
+            return "\n".join(processed_lines)
+        except (ImportError, Exception):
+            # If bidi library is not available or fails, return original text
+            return text
+    def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
+        """
+        Apply content filter to a text string.
+        Args:
+            text: Input text string
+            content_filter: Content filter (regex, callable, or list of regexes)
+        Returns:
+            Filtered text string
+        """
+        if not text or content_filter is None:
+            return text
+        import re
+        if isinstance(content_filter, str):
+            # Single regex pattern - remove matching parts
+            try:
+                return re.sub(content_filter, '', text)
+            except re.error:
+                return text  # Invalid regex, return original
+        elif isinstance(content_filter, list):
+            # List of regex patterns - remove parts matching ANY pattern
+            try:
+                result = text
+                for pattern in content_filter:
+                    result = re.sub(pattern, '', result)
+                return result
+            except re.error:
+                return text  # Invalid regex, return original
+        elif callable(content_filter):
+            # Callable filter - apply to individual characters
+            try:
+                filtered_chars = []
+                for char in text:
+                    if content_filter(char):
+                        filtered_chars.append(char)
+                return ''.join(filtered_chars)
+            except Exception:
+                return text  # Function error, return original
+        return text

natural_pdf/elements/text.py CHANGED Viewed

@@ -230,7 +230,7 @@ class TextElement(Element):
         # Default to black
         return (0, 0, 0)
-    def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, **kwargs) -> str:
+    def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs) -> str:
         """
         Extract text from this element.
@@ -238,14 +238,48 @@ class TextElement(Element):
             keep_blank_chars: Retained for API compatibility (unused).
             strip: If True (default) remove leading/trailing whitespace. Users may
                    pass ``strip=False`` to preserve whitespace exactly as stored.
+            content_filter: Optional content filter to exclude specific text patterns. Can be:
+                - A regex pattern string (characters matching the pattern are EXCLUDED)
+                - A callable that takes text and returns True to KEEP the character
+                - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
             **kwargs: Accepted for forward-compatibility and ignored here.
         Returns:
-            The text content, optionally stripped.
+            The text content, optionally stripped and filtered.
         """
         # Basic retrieval
         result = self.text or ""
+        # Apply content filtering if provided
+        if content_filter is not None and result:
+            import re
+            if isinstance(content_filter, str):
+                # Single regex pattern - remove matching characters
+                try:
+                    result = re.sub(content_filter, '', result)
+                except re.error:
+                    pass  # Invalid regex, skip filtering
+            elif isinstance(content_filter, list):
+                # List of regex patterns - remove characters matching ANY pattern
+                try:
+                    for pattern in content_filter:
+                        result = re.sub(pattern, '', result)
+                except re.error:
+                    pass  # Invalid regex, skip filtering
+            elif callable(content_filter):
+                # Callable filter - apply to individual characters
+                try:
+                    filtered_chars = []
+                    for char in result:
+                        if content_filter(char):
+                            filtered_chars.append(char)
+                    result = ''.join(filtered_chars)
+                except Exception:
+                    pass  # Function error, skip filtering
         # Apply optional stripping – align with global convention where simple
         # element extraction is stripped by default.
         if strip:

natural_pdf/flows/region.py CHANGED Viewed

@@ -1,11 +1,13 @@
 import logging
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
-from pdfplumber.utils.geometry import objects_to_bbox  # For calculating combined bbox
+from pdfplumber.utils.geometry import merge_bboxes  # Import merge_bboxes directly
 # For runtime image manipulation
 from PIL import Image as PIL_Image_Runtime
+from natural_pdf.tables import TableResult
 if TYPE_CHECKING:
     from PIL.Image import Image as PIL_Image  # For type hints
@@ -53,28 +55,46 @@ class FlowRegion:
         self.source_flow_element: "FlowElement" = source_flow_element
         self.boundary_element_found: Optional["PhysicalElement"] = boundary_element_found
+        # Add attributes for grid building, similar to Region
+        self.source: Optional[str] = None
+        self.region_type: Optional[str] = None
+        self.metadata: Dict[str, Any] = {}
         # Cache for expensive operations
         self._cached_text: Optional[str] = None
         self._cached_elements: Optional["ElementCollection"] = None  # Stringized
         self._cached_bbox: Optional[Tuple[float, float, float, float]] = None
+    def __getattr__(self, name: str) -> Any:
+        """
+        Dynamically proxy attribute access to the source FlowElement if the
+        attribute is not found in this instance.
+        """
+        if name in self.__dict__:
+            return self.__dict__[name]
+        elif self.source_flow_element is not None:
+            return getattr(self.source_flow_element, name)
+        else:
+            raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
     @property
     def bbox(self) -> Optional[Tuple[float, float, float, float]]:
         """
-        Calculates a conceptual bounding box that encompasses all constituent physical regions.
-        This is the union of the bounding boxes of the constituent regions in their
-        original physical coordinates.
-        Returns None if there are no constituent regions.
+        The bounding box that encloses all constituent regions.
+        Calculated dynamically and cached.
         """
         if self._cached_bbox is not None:
             return self._cached_bbox
         if not self.constituent_regions:
             return None
-        # Use objects_to_bbox from pdfplumber.utils.geometry to merge bboxes
-        # This helper expects a list of objects that have .x0, .top, .x1, .bottom attributes.
-        # Our PhysicalRegion objects satisfy this.
-        self._cached_bbox = objects_to_bbox(self.constituent_regions)
+        # Use merge_bboxes from pdfplumber.utils.geometry to merge bboxes
+        # Extract bbox tuples from regions first
+        region_bboxes = [region.bbox for region in self.constituent_regions if hasattr(region, "bbox")]
+        if not region_bboxes:
+            return None
+        self._cached_bbox = merge_bboxes(region_bboxes)
         return self._cached_bbox
     @property
@@ -200,22 +220,72 @@ class FlowRegion:
         self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
     ) -> Optional["PhysicalElement"]:  # Stringized
         """
-        Finds the first physical element within this FlowRegion that matches the selector or text.
+        Find the first element in flow order that matches the selector or text.
+        This implementation iterates through the constituent regions *in the order
+        they appear in ``self.constituent_regions`` (i.e. document flow order),
+        delegating the search to each region's own ``find`` method.  It therefore
+        avoids constructing a huge intermediate ElementCollection and returns as
+        soon as a match is found, which is substantially faster and ensures that
+        selectors such as 'table' work exactly as they do on an individual
+        Region.
         """
-        # Uses self.elements() which respects exclusions if apply_exclusions=True by default
-        all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
-        return all_elems.find(selector=selector, text=text, **kwargs)  # ElementCollection.find
+        if not self.constituent_regions:
+            return None
+        for region in self.constituent_regions:
+            try:
+                result = region.find(selector=selector, text=text, **kwargs)
+                if result is not None:
+                    return result
+            except Exception as e:
+                logger.warning(
+                    f"FlowRegion.find: error searching region {region}: {e}",
+                    exc_info=False,
+                )
+        return None  # No match found
     def find_all(
         self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
     ) -> "ElementCollection":  # Stringized
         """
-        Finds all physical elements within this FlowRegion that match the selector or text.
+        Find **all** elements across the constituent regions that match the given
+        selector or text.
+        Rather than first materialising *every* element in the FlowRegion (which
+        can be extremely slow for multi-page flows), this implementation simply
+        chains each region's native ``find_all`` call and concatenates their
+        results into a single ElementCollection while preserving flow order.
         """
-        all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
-        return all_elems.find_all(
+        from natural_pdf.elements.collections import (
+            ElementCollection as RuntimeElementCollection,
+        )
+        matched_elements = []  # type: List["PhysicalElement"]
+        if not self.constituent_regions:
+            return RuntimeElementCollection([])
+        for region in self.constituent_regions:
+            try:
+                region_matches = region.find_all(
             selector=selector, text=text, **kwargs
-        )  # ElementCollection.find_all
+                )
+                if region_matches:
+                    # ``region_matches`` is an ElementCollection – extend with its
+                    # underlying list so we don't create nested collections.
+                    matched_elements.extend(
+                        region_matches.elements
+                        if hasattr(region_matches, "elements")
+                        else list(region_matches)
+                    )
+            except Exception as e:
+                logger.warning(
+                    f"FlowRegion.find_all: error searching region {region}: {e}",
+                    exc_info=False,
+                )
+        return RuntimeElementCollection(matched_elements)
     def highlight(
         self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs
@@ -253,6 +323,7 @@ class FlowRegion:
         stack_direction: str = "vertical",
         stack_gap: int = 5,
         stack_background_color: Tuple[int, int, int] = (255, 255, 255),
+        crop: bool = False,
         **kwargs,
     ) -> Optional["PIL_Image"]:
         """
@@ -269,6 +340,7 @@ class FlowRegion:
             stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
             stack_gap: Gap in pixels between stacked pages.
             stack_background_color: RGB background color for the stacked image.
+            crop: If True, crop each rendered page to the bounding box of constituent regions on that page.
             **kwargs: Additional arguments passed to the underlying rendering methods.
         Returns:
@@ -358,6 +430,16 @@ class FlowRegion:
             if not temp_highlights_for_page:
                 continue
+            # Calculate crop bbox if cropping is enabled
+            crop_bbox = None
+            if crop and constituent_regions_on_this_page:
+                # Calculate the bounding box that encompasses all constituent regions on this page
+                min_x0 = min(region.bbox[0] for region in constituent_regions_on_this_page)
+                min_y0 = min(region.bbox[1] for region in constituent_regions_on_this_page)
+                max_x1 = max(region.bbox[2] for region in constituent_regions_on_this_page)
+                max_y1 = max(region.bbox[3] for region in constituent_regions_on_this_page)
+                crop_bbox = (min_x0, min_y0, max_x1, max_y1)
             page_image = highlighter_service.render_preview(
                 page_index=(
                     page_obj.index
@@ -369,6 +451,7 @@ class FlowRegion:
                 width=width,
                 labels=labels,  # Pass through labels
                 legend_position=legend_position,
+                crop_bbox=crop_bbox,
                 **kwargs,
             )
             if page_image:
@@ -549,7 +632,7 @@ class FlowRegion:
         cell_extraction_func: Optional[Callable[["PhysicalRegion"], Optional[str]]] = None,
         show_progress: bool = False,
         **kwargs,
-    ) -> List[List[Optional[str]]]:
+    ) -> TableResult:
         """Extracts a single logical table from the FlowRegion.
         This is a convenience wrapper that iterates through the constituent
@@ -565,9 +648,9 @@ class FlowRegion:
                 ``Region.extract_table`` implementation.
         Returns:
-            A list of rows (``List[List[Optional[str]]]``).  Rows returned from
+            A TableResult object containing the aggregated table data.  Rows returned from
             consecutive constituent regions are appended in document order.  If
-            no tables are detected in any region, an empty list is returned.
+            no tables are detected in any region, an empty TableResult is returned.
         """
         if table_settings is None:
@@ -576,13 +659,13 @@ class FlowRegion:
             text_options = {}
         if not self.constituent_regions:
-            return []
+            return TableResult([])
         aggregated_rows: List[List[Optional[str]]] = []
         for region in self.constituent_regions:
             try:
-                region_rows = region.extract_table(
+                region_result = region.extract_table(
                     method=method,
                     table_settings=table_settings.copy(),  # Avoid side-effects
                     use_ocr=use_ocr,
@@ -593,16 +676,16 @@ class FlowRegion:
                     **kwargs,
                 )
-                # ``region_rows`` can legitimately be [] if no table found.
-                if region_rows:
-                    aggregated_rows.extend(region_rows)
+                # region_result is now a TableResult object, extract the rows
+                if region_result:
+                    aggregated_rows.extend(region_result)
             except Exception as e:
                 logger.error(
                     f"FlowRegion.extract_table: Error extracting table from constituent region {region}: {e}",
                     exc_info=True,
                 )
-        return aggregated_rows
+        return TableResult(aggregated_rows)
     def extract_tables(
         self,
@@ -649,3 +732,22 @@ class FlowRegion:
                 )
         return all_tables
+    @property
+    def normalized_type(self) -> Optional[str]:
+        """
+        Return the normalized type for selector compatibility.
+        This allows FlowRegion to be found by selectors like 'table'.
+        """
+        if self.region_type:
+            # Convert region_type to normalized format (replace spaces with underscores, lowercase)
+            return self.region_type.lower().replace(" ", "_")
+        return None
+    @property
+    def type(self) -> Optional[str]:
+        """
+        Return the type attribute for selector compatibility.
+        This is an alias for normalized_type.
+        """
+        return self.normalized_type

natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl

natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl