PyPI - natural-pdf - Versions diffs - 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl - Mend

natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

natural_pdf/__init__.py +6 -0
natural_pdf/core/page.py +90 -22
natural_pdf/core/pdf.py +183 -59
natural_pdf/elements/collections.py +202 -47
natural_pdf/elements/region.py +176 -56
natural_pdf/flows/element.py +25 -0
natural_pdf/flows/flow.py +702 -20
natural_pdf/flows/region.py +52 -4
natural_pdf/selectors/parser.py +34 -1
natural_pdf/text_mixin.py +97 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/METADATA +1 -1
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/RECORD +16 -15
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/top_level.txt +0 -0

natural_pdf/elements/region.py CHANGED Viewed

@@ -21,6 +21,7 @@ from natural_pdf.elements.text import TextElement  # ADDED IMPORT
 from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import utility
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.text_mixin import TextMixin
 # ------------------------------------------------------------------
 # Table utilities
@@ -56,7 +57,12 @@ logger = logging.getLogger(__name__)
 class Region(
-    DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin
+    TextMixin,
+    DirectionalMixin,
+    ClassificationMixin,
+    ExtractionMixin,
+    ShapeDetectionMixin,
+    DescribeMixin,
 ):
     """Represents a rectangular region on a page.
@@ -1610,14 +1616,71 @@ class Region(
             table_settings.setdefault("join_x_tolerance", join)
             table_settings.setdefault("join_y_tolerance", join)
-        # Create a crop of the page for this region
-        cropped = self.page._page.crop(self.bbox)
+        # -------------------------------------------------------------
+        # Apply char-level exclusion filtering, if any exclusions are
+        # defined on the parent Page.  We create a lightweight
+        # pdfplumber.Page copy whose .chars list omits characters that
+        # fall inside any exclusion Region.  Other object types are
+        # left untouched for now ("chars-only" strategy).
+        # -------------------------------------------------------------
+        base_plumber_page = self.page._page
+        if getattr(self.page, "_exclusions", None):
+            # Resolve exclusion Regions (callables already evaluated)
+            exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
+            def _keep_char(obj):
+                """Return True if pdfplumber obj should be kept."""
+                if obj.get("object_type") != "char":
+                    # Keep non-char objects unchanged – lattice grids etc.
+                    return True
+                # Compute character centre point
+                cx = (obj["x0"] + obj["x1"]) / 2.0
+                cy = (obj["top"] + obj["bottom"]) / 2.0
+                # Reject if the centre lies inside ANY exclusion Region
+                for reg in exclusion_regions:
+                    if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
+                        return False
+                return True
+            try:
+                filtered_page = base_plumber_page.filter(_keep_char)
+            except Exception as _filter_err:
+                # Fallback – if filtering fails, log and proceed unfiltered
+                logger.warning(
+                    f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions: {_filter_err}"
+                )
+                filtered_page = base_plumber_page
+        else:
+            filtered_page = base_plumber_page
+        cropped = filtered_page.crop(self.bbox)
         # Extract all tables from the cropped area
         tables = cropped.extract_tables(table_settings)
-        # Return the tables or an empty list if none found
-        return tables if tables else []
+        # Apply RTL text processing to all tables
+        if tables:
+            processed_tables = []
+            for table in tables:
+                processed_table = []
+                for row in table:
+                    processed_row = []
+                    for cell in row:
+                        if cell is not None:
+                            # Apply RTL text processing to each cell
+                            rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
+                            processed_row.append(rtl_processed_cell)
+                        else:
+                            processed_row.append(cell)
+                    processed_table.append(processed_row)
+                processed_tables.append(processed_table)
+            return processed_tables
+        # Return empty list if no tables found
+        return []
     def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
         """
@@ -1654,29 +1717,63 @@ class Region(
             if y_tol is not None:
                 table_settings.setdefault("text_y_tolerance", y_tol)
-        # Create a crop of the page for this region
-        cropped = self.page._page.crop(self.bbox)
+        # -------------------------------------------------------------
+        # Apply char-level exclusion filtering (chars only) just like in
+        # _extract_tables_plumber so header/footer text does not appear
+        # in extracted tables.
+        # -------------------------------------------------------------
+        base_plumber_page = self.page._page
+        if getattr(self.page, "_exclusions", None):
+            exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
+            def _keep_char(obj):
+                if obj.get("object_type") != "char":
+                    return True
+                cx = (obj["x0"] + obj["x1"]) / 2.0
+                cy = (obj["top"] + obj["bottom"]) / 2.0
+                for reg in exclusion_regions:
+                    if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
+                        return False
+                return True
+            try:
+                filtered_page = base_plumber_page.filter(_keep_char)
+            except Exception as _filter_err:
+                logger.warning(
+                    f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions (single table): {_filter_err}"
+                )
+                filtered_page = base_plumber_page
+        else:
+            filtered_page = base_plumber_page
+        # Now crop the (possibly filtered) page to the region bbox
+        cropped = filtered_page.crop(self.bbox)
         # Extract the single largest table from the cropped area
         table = cropped.extract_table(table_settings)
         # Return the table or an empty list if none found
         if table:
-            # Apply content filtering if provided
-            if content_filter is not None:
-                filtered_table = []
-                for row in table:
-                    filtered_row = []
-                    for cell in row:
-                        if cell is not None:
-                            # Apply content filter to cell text
-                            filtered_cell = self._apply_content_filter_to_text(cell, content_filter)
-                            filtered_row.append(filtered_cell)
+            # Apply RTL text processing and content filtering if provided
+            processed_table = []
+            for row in table:
+                processed_row = []
+                for cell in row:
+                    if cell is not None:
+                        # Apply RTL text processing first
+                        rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
+                        # Then apply content filter if provided
+                        if content_filter is not None:
+                            filtered_cell = self._apply_content_filter_to_text(rtl_processed_cell, content_filter)
+                            processed_row.append(filtered_cell)
                         else:
-                            filtered_row.append(cell)
-                    filtered_table.append(filtered_row)
-                return filtered_table
-            return table
+                            processed_row.append(rtl_processed_cell)
+                    else:
+                        processed_row.append(cell)
+                processed_table.append(processed_row)
+            return processed_table
         return []
     def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
@@ -2985,45 +3082,20 @@ class Region(
         source_info = f" source='{self.source}'" if self.source else ""
         return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
-    def correct_ocr(
+    def update_text(
         self,
-        correction_callback: Callable[[Any], Optional[str]],
-    ) -> "Region":  # Return self for chaining
-        """
-        Applies corrections to OCR-generated text elements within this region
-        using a user-provided callback function.
-        Finds text elements within this region whose 'source' attribute starts
-        with 'ocr' and calls the `correction_callback` for each, passing the
-        element itself.
-        The `correction_callback` should contain the logic to:
-        1. Determine if the element needs correction.
-        2. Perform the correction (e.g., call an LLM).
-        3. Return the new text (`str`) or `None`.
-        If the callback returns a string, the element's `.text` is updated.
-        Metadata updates (source, confidence, etc.) should happen within the callback.
-        Args:
-            correction_callback: A function accepting an element and returning
-                                 `Optional[str]` (new text or None).
+        transform: Callable[[Any], Optional[str]],
+        *,
+        selector: str = "text",
+        apply_exclusions: bool = False,
+    ) -> "Region":
+        """Apply *transform* to every text element matched by *selector* inside this region.
-        Returns:
-            Self for method chaining.
+        The heavy lifting is delegated to :py:meth:`TextMixin.update_text`; this
+        override simply ensures the search is scoped to the region.
         """
-        # Find OCR elements specifically within this region
-        # Note: We typically want to correct even if the element falls in an excluded area
-        target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
-        # Delegate to the utility function
-        _apply_ocr_correction_to_elements(
-            elements=target_elements,  # Pass the ElementCollection directly
-            correction_callback=correction_callback,
-            caller_info=f"Region({self.bbox})",  # Pass caller info
-        )
-        return self  # Return self for chaining
+        return TextMixin.update_text(self, transform, selector=selector, apply_exclusions=apply_exclusions)
     # --- Classification Mixin Implementation --- #
     def _get_classification_manager(self) -> "ClassificationManager":
@@ -3490,6 +3562,54 @@ class Region(
         return table_grid
+    def _apply_rtl_processing_to_text(self, text: str) -> str:
+        """
+        Apply RTL (Right-to-Left) text processing to a string.
+        This converts visual order text (as stored in PDFs) to logical order
+        for proper display of Arabic, Hebrew, and other RTL scripts.
+        Args:
+            text: Input text string in visual order
+        Returns:
+            Text string in logical order
+        """
+        if not text or not text.strip():
+            return text
+        # Quick check for RTL characters - if none found, return as-is
+        import unicodedata
+        def _contains_rtl(s):
+            return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
+        if not _contains_rtl(text):
+            return text
+        try:
+            from bidi.algorithm import get_display  # type: ignore
+            from natural_pdf.utils.bidi_mirror import mirror_brackets
+            # Apply BiDi algorithm to convert from visual to logical order
+            # Process line by line to handle mixed content properly
+            processed_lines = []
+            for line in text.split("\n"):
+                if line.strip():
+                    # Determine base direction for this line
+                    base_dir = "R" if _contains_rtl(line) else "L"
+                    logical_line = get_display(line, base_dir=base_dir)
+                    # Apply bracket mirroring for correct logical order
+                    processed_lines.append(mirror_brackets(logical_line))
+                else:
+                    processed_lines.append(line)
+            return "\n".join(processed_lines)
+        except (ImportError, Exception):
+            # If bidi library is not available or fails, return original text
+            return text
     def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
         """
         Apply content filter to a text string.

natural_pdf/flows/element.py CHANGED Viewed

@@ -73,6 +73,31 @@ class FlowElement:
         """Returns the physical page of the underlying element."""
         return getattr(self.physical_object, "page", None)
+    def __getattr__(self, name: str) -> Any:
+        """
+        Delegate unknown attribute access to the physical_object.
+        This ensures that attributes like 'type', 'region_type', 'source', 'model', etc.
+        from the physical element are accessible on the FlowElement wrapper.
+        Args:
+            name: The attribute name being accessed
+        Returns:
+            The attribute value from physical_object
+        Raises:
+            AttributeError: If the attribute doesn't exist on physical_object either
+        """
+        try:
+            return getattr(self.physical_object, name)
+        except AttributeError:
+            # Provide a helpful error message that mentions both FlowElement and physical_object
+            raise AttributeError(
+                f"'{type(self).__name__}' object has no attribute '{name}' "
+                f"(also not found on underlying {type(self.physical_object).__name__})"
+            )
     def _flow_direction(
         self,
         direction: str,  # "above", "below", "left", "right"

natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl

natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl