PyPI - natural-pdf - Versions diffs - 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl - Mend

natural-pdf 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

natural_pdf/analyzers/shape_detection_mixin.py +40 -0
natural_pdf/core/highlighting_service.py +4 -4
natural_pdf/core/page.py +16 -2
natural_pdf/describe/base.py +11 -1
natural_pdf/describe/summary.py +26 -0
natural_pdf/elements/base.py +2 -2
natural_pdf/elements/collections.py +139 -100
natural_pdf/elements/region.py +133 -12
natural_pdf/elements/text.py +15 -7
natural_pdf/flows/region.py +116 -1
natural_pdf/qa/document_qa.py +162 -105
natural_pdf/utils/text_extraction.py +34 -14
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +2 -1
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +18 -18
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0

natural_pdf/elements/region.py CHANGED Viewed

@@ -1210,6 +1210,24 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                 # Try lattice first, then fall back to stream if no meaningful results
                 logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
+                # --- NEW: Prefer already-created table_cell regions if they exist --- #
+                try:
+                    cell_regions_in_table = [
+                        c
+                        for c in self.page.find_all("region[type=table_cell]", apply_exclusions=False)
+                        if self.intersects(c)
+                    ]
+                except Exception as _cells_err:
+                    cell_regions_in_table = []  # Fallback silently
+                if cell_regions_in_table:
+                    logger.debug(
+                        f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
+                    )
+                    return self._extract_table_from_cells(cell_regions_in_table)
+                # --------------------------------------------------------------- #
                 try:
                     logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
                     lattice_result = self.extract_table(
@@ -1905,19 +1923,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             logger.info(
                 f"Region {self.bbox}: Removing existing OCR elements before applying new OCR."
             )
-            # Find all OCR elements in this region
-            ocr_selector = "text[source=ocr]"
-            ocr_elements = self.find_all(ocr_selector)
+            # Remove existing OCR word elements strictly inside this region
+            ocr_selector = "text[source=ocr]"
+            ocr_elements = self.find_all(ocr_selector, apply_exclusions=False)
             if ocr_elements:
+                removed_count = ocr_elements.remove()
                 logger.info(
-                    f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove."
+                    f"Region {self.bbox}: Removed {removed_count} existing OCR word elements in region before re-applying OCR."
                 )
-                # Remove these elements from their page
-                removed_count = ocr_elements.remove()
-                logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
             else:
-                logger.info(f"Region {self.bbox}: No existing OCR elements found to remove.")
+                logger.info(
+                    f"Region {self.bbox}: No existing OCR word elements found within region to remove."
+                )
         ocr_mgr = self.page._parent._ocr_manager
@@ -1978,8 +1996,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                 page_top = self.top + (img_top * scale_y)
                 page_x1 = self.x0 + (img_x1 * scale_x)
                 page_bottom = self.top + (img_bottom * scale_y)
+                raw_conf = result.get("confidence")
+                # Convert confidence to float unless it is None/invalid
+                try:
+                    confidence_val = float(raw_conf) if raw_conf is not None else None
+                except (TypeError, ValueError):
+                    confidence_val = None
+                text_val = result.get("text")  # May legitimately be None in detect_only mode
                 element_data = {
-                    "text": result["text"],
+                    "text": text_val,
                     "x0": page_x0,
                     "top": page_top,
                     "x1": page_x1,
@@ -1988,7 +2015,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                     "height": page_bottom - page_top,
                     "object_type": "word",
                     "source": "ocr",
-                    "confidence": float(result.get("confidence", 0.0)),
+                    "confidence": confidence_val,
                     "fontname": "OCR",
                     "size": round(pdf_height) if pdf_height > 0 else 10.0,
                     "page_number": self.page.number,
@@ -2324,12 +2351,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
     def ask(
         self,
-        question: str,
+        question: Union[str, List[str], Tuple[str, ...]],
         min_confidence: float = 0.1,
         model: str = None,
         debug: bool = False,
         **kwargs,
-    ) -> Dict[str, Any]:
+    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
         """
         Ask a question about the region content using document QA.
@@ -2870,4 +2897,98 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             self.metadata = {}
         self.metadata["analysis"] = value
+    # ------------------------------------------------------------------
+    # New helper: build table from pre-computed table_cell regions
+    # ------------------------------------------------------------------
+    def _extract_table_from_cells(self, cell_regions: List["Region"]) -> List[List[Optional[str]]]:
+        """Construct a table (list-of-lists) from table_cell regions.
+        This assumes each cell Region has metadata.row_index / col_index as written by
+        detect_table_structure_from_lines().  If these keys are missing we will
+        fall back to sorting by geometry.
+        """
+        if not cell_regions:
+            return []
+        # Attempt to use explicit indices first
+        all_row_idxs = []
+        all_col_idxs = []
+        for cell in cell_regions:
+            try:
+                r_idx = int(cell.metadata.get("row_index"))
+                c_idx = int(cell.metadata.get("col_index"))
+                all_row_idxs.append(r_idx)
+                all_col_idxs.append(c_idx)
+            except Exception:
+                # Not all cells have indices – clear the lists so we switch to geometric sorting
+                all_row_idxs = []
+                all_col_idxs = []
+                break
+        if all_row_idxs and all_col_idxs:
+            num_rows = max(all_row_idxs) + 1
+            num_cols = max(all_col_idxs) + 1
+            # Initialise blank grid
+            table_grid: List[List[Optional[str]]] = [[None] * num_cols for _ in range(num_rows)]
+            for cell in cell_regions:
+                try:
+                    r_idx = int(cell.metadata.get("row_index"))
+                    c_idx = int(cell.metadata.get("col_index"))
+                    text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
+                    table_grid[r_idx][c_idx] = text_val if text_val else None
+                except Exception as _err:
+                    # Skip problematic cell
+                    continue
+            return table_grid
+        # ------------------------------------------------------------------
+        # Fallback: derive order purely from geometry if indices are absent
+        # ------------------------------------------------------------------
+        # Sort unique centers to define ordering
+        try:
+            import numpy as np
+        except ImportError:
+            logger.warning("NumPy required for geometric cell ordering; returning empty result.")
+            return []
+        # Build arrays of centers
+        centers = np.array([
+            [(c.x0 + c.x1) / 2.0, (c.top + c.bottom) / 2.0] for c in cell_regions
+        ])
+        xs = centers[:, 0]
+        ys = centers[:, 1]
+        # Cluster unique row Y positions and column X positions with a tolerance
+        def _cluster(vals, tol=1.0):
+            sorted_vals = np.sort(vals)
+            groups = [[sorted_vals[0]]]
+            for v in sorted_vals[1:]:
+                if abs(v - groups[-1][-1]) <= tol:
+                    groups[-1].append(v)
+                else:
+                    groups.append([v])
+            return [np.mean(g) for g in groups]
+        row_centers = _cluster(ys)
+        col_centers = _cluster(xs)
+        num_rows = len(row_centers)
+        num_cols = len(col_centers)
+        table_grid: List[List[Optional[str]]] = [[None] * num_cols for _ in range(num_rows)]
+        # Assign each cell to nearest row & col center
+        for cell, (cx, cy) in zip(cell_regions, centers):
+            row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
+            col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
+            text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
+            table_grid[row_idx][col_idx] = text_val if text_val else None
+        return table_grid

natural_pdf/elements/text.py CHANGED Viewed

@@ -151,20 +151,28 @@ class TextElement(Element):
         # Default to black
         return (0, 0, 0)
-    def extract_text(self, keep_blank_chars=True, **kwargs) -> str:
+    def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, **kwargs) -> str:
         """
         Extract text from this element.
         Args:
-            keep_blank_chars: Whether to keep blank characters (default: True)
-            **kwargs: Additional extraction parameters
+            keep_blank_chars: Retained for API compatibility (unused).
+            strip: If True (default) remove leading/trailing whitespace. Users may
+                   pass ``strip=False`` to preserve whitespace exactly as stored.
+            **kwargs: Accepted for forward-compatibility and ignored here.
         Returns:
-            Text content
+            The text content, optionally stripped.
         """
-        # For text elements, keep_blank_chars doesn't affect anything as we're
-        # simply returning the text property. Included for API consistency.
-        return self.text
+        # Basic retrieval
+        result = self.text or ""
+        # Apply optional stripping – align with global convention where simple
+        # element extraction is stripped by default.
+        if strip:
+            result = result.strip()
+        return result
     def contains(self, substring: str, case_sensitive: bool = True) -> bool:
         """

natural_pdf/flows/region.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import logging
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, Callable
 from pdfplumber.utils.geometry import objects_to_bbox  # For calculating combined bbox
@@ -519,3 +519,118 @@ class FlowRegion:
             )
         except Exception:
             return True  # If error during check, assume empty to be safe
+    # ------------------------------------------------------------------
+    # Table extraction helpers (delegates to underlying physical regions)
+    # ------------------------------------------------------------------
+    def extract_table(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[Dict] = None,
+        cell_extraction_func: Optional[Callable[["PhysicalRegion"], Optional[str]]] = None,
+        show_progress: bool = False,
+        **kwargs,
+    ) -> List[List[Optional[str]]]:
+        """Extracts a single logical table from the FlowRegion.
+        This is a convenience wrapper that iterates through the constituent
+        physical regions **in flow order**, calls their ``extract_table``
+        method, and concatenates the resulting rows.  It mirrors the public
+        interface of :pymeth:`natural_pdf.elements.region.Region.extract_table`.
+        Args:
+            method, table_settings, use_ocr, ocr_config, text_options, cell_extraction_func, show_progress:
+                Same as in :pymeth:`Region.extract_table` and are forwarded as-is
+                to each physical region.
+            **kwargs: Additional keyword arguments forwarded to the underlying
+                ``Region.extract_table`` implementation.
+        Returns:
+            A list of rows (``List[List[Optional[str]]]``).  Rows returned from
+            consecutive constituent regions are appended in document order.  If
+            no tables are detected in any region, an empty list is returned.
+        """
+        if table_settings is None:
+            table_settings = {}
+        if text_options is None:
+            text_options = {}
+        if not self.constituent_regions:
+            return []
+        aggregated_rows: List[List[Optional[str]]] = []
+        for region in self.constituent_regions:
+            try:
+                region_rows = region.extract_table(
+                    method=method,
+                    table_settings=table_settings.copy(),  # Avoid side-effects
+                    use_ocr=use_ocr,
+                    ocr_config=ocr_config,
+                    text_options=text_options.copy(),
+                    cell_extraction_func=cell_extraction_func,
+                    show_progress=show_progress,
+                    **kwargs,
+                )
+                # ``region_rows`` can legitimately be [] if no table found.
+                if region_rows:
+                    aggregated_rows.extend(region_rows)
+            except Exception as e:
+                logger.error(
+                    f"FlowRegion.extract_table: Error extracting table from constituent region {region}: {e}",
+                    exc_info=True,
+                )
+        return aggregated_rows
+    def extract_tables(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        **kwargs,
+    ) -> List[List[List[Optional[str]]]]:
+        """Extract **all** tables from the FlowRegion.
+        This simply chains :pymeth:`Region.extract_tables` over each physical
+        region and concatenates their results, preserving flow order.
+        Args:
+            method, table_settings: Forwarded to underlying ``Region.extract_tables``.
+            **kwargs: Additional keyword arguments forwarded.
+        Returns:
+            A list where each item is a full table (list of rows).  The order of
+            tables follows the order of the constituent regions in the flow.
+        """
+        if table_settings is None:
+            table_settings = {}
+        if not self.constituent_regions:
+            return []
+        all_tables: List[List[List[Optional[str]]]] = []
+        for region in self.constituent_regions:
+            try:
+                region_tables = region.extract_tables(
+                    method=method,
+                    table_settings=table_settings.copy(),
+                    **kwargs,
+                )
+                # ``region_tables`` is a list (possibly empty).
+                if region_tables:
+                    all_tables.extend(region_tables)
+            except Exception as e:
+                logger.error(
+                    f"FlowRegion.extract_tables: Error extracting tables from constituent region {region}: {e}",
+                    exc_info=True,
+                )
+        return all_tables

natural-pdf 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl

natural-pdf 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl