PyPI - natural-pdf - Versions diffs - 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl - Mend

natural-pdf 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

natural_pdf/analyzers/guides.py +185 -9
natural_pdf/core/element_manager.py +5 -0
natural_pdf/core/page.py +42 -4
natural_pdf/core/pdf.py +45 -3
natural_pdf/core/pdf_collection.py +131 -4
natural_pdf/core/render_spec.py +2 -2
natural_pdf/elements/base.py +18 -14
natural_pdf/elements/region.py +42 -21
natural_pdf/tables/result.py +39 -6
natural_pdf/vision/__init__.py +7 -0
natural_pdf/vision/mixin.py +209 -0
natural_pdf/vision/results.py +146 -0
natural_pdf/vision/similarity.py +321 -0
{natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/METADATA +1 -1
{natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/RECORD +19 -15
{natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/WHEEL +0 -0
{natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/top_level.txt +0 -0

natural_pdf/elements/base.py CHANGED Viewed

@@ -1192,7 +1192,7 @@ class Element(
         self,
         mode: Literal["show", "render"] = "show",
         color: Optional[Union[str, Tuple[int, int, int]]] = None,
-        highlights: Optional[List[Dict[str, Any]]] = None,
+        highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
         crop: Union[bool, Literal["content"]] = False,
         crop_bbox: Optional[Tuple[float, float, float, float]] = None,
         label: Optional[str] = None,
@@ -1203,7 +1203,7 @@ class Element(
         Args:
             mode: Rendering mode - 'show' includes highlights, 'render' is clean
             color: Color for highlighting this element in show mode
-            highlights: Additional highlight groups to show
+            highlights: Additional highlight groups to show, or False to disable all highlights
             crop: Whether to crop to element bounds
             crop_bbox: Explicit crop bounds
             label: Optional label for this element
@@ -1225,19 +1225,23 @@ class Element(
             if hasattr(self, "bbox") and self.bbox:
                 spec.crop_bbox = self.bbox
-        # Add highlight in show mode
-        if mode == "show":
-            # Use provided label or generate one
-            element_label = label if label is not None else self.__class__.__name__
-            spec.add_highlight(
-                element=self,
-                color=color or "red",  # Default red for single element
-                label=element_label,
-            )
+        # Add highlight in show mode (unless explicitly disabled with highlights=False)
+        if mode == "show" and highlights is not False:
+            # Only highlight this element if:
+            # 1. We're not cropping, OR
+            # 2. We're cropping but color was explicitly specified
+            if not crop or color is not None:
+                # Use provided label or generate one
+                element_label = label if label is not None else self.__class__.__name__
+                spec.add_highlight(
+                    element=self,
+                    color=color or "red",  # Default red for single element
+                    label=element_label,
+                )
-            # Add additional highlight groups if provided
-            if highlights:
+            # Add additional highlight groups if provided (and highlights is a list)
+            if highlights and isinstance(highlights, list):
                 for group in highlights:
                     group_elements = group.get("elements", [])
                     group_color = group.get("color", color)

natural_pdf/elements/region.py CHANGED Viewed

@@ -221,7 +221,7 @@ class Region(
         self,
         mode: Literal["show", "render"] = "show",
         color: Optional[Union[str, Tuple[int, int, int]]] = None,
-        highlights: Optional[List[Dict[str, Any]]] = None,
+        highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
         crop: Union[bool, Literal["content"]] = True,  # Default to True for regions
         crop_bbox: Optional[Tuple[float, float, float, float]] = None,
         **kwargs,
@@ -231,7 +231,7 @@ class Region(
         Args:
             mode: Rendering mode - 'show' includes highlights, 'render' is clean
             color: Color for highlighting this region in show mode
-            highlights: Additional highlight groups to show
+            highlights: Additional highlight groups to show, or False to disable all highlights
             crop: Whether to crop to this region
             crop_bbox: Explicit crop bounds (overrides region bounds)
             **kwargs: Additional parameters
@@ -250,10 +250,12 @@ class Region(
             # Crop to this region's bounds
             spec.crop_bbox = self.bbox
-        # Add highlights in show mode
-        if mode == "show":
-            # Highlight this region
-            if color or mode == "show":  # Always highlight in show mode
+        # Add highlights in show mode (unless explicitly disabled with highlights=False)
+        if mode == "show" and highlights is not False:
+            # Only highlight this region if:
+            # 1. We're not cropping, OR
+            # 2. We're cropping but color was explicitly specified
+            if not crop or color is not None:
                 spec.add_highlight(
                     bbox=self.bbox,
                     polygon=self.polygon if self.has_polygon else None,
@@ -261,8 +263,8 @@ class Region(
                     label=self.label or self.name or "Region",
                 )
-            # Add additional highlight groups if provided
-            if highlights:
+            # Add additional highlight groups if provided (and highlights is a list)
+            if highlights and isinstance(highlights, list):
                 for group in highlights:
                     elements = group.get("elements", [])
                     group_color = group.get("color", color)
@@ -1234,6 +1236,7 @@ class Region(
         content_filter: Optional[
             Union[str, Callable[[str], bool], List[str]]
         ] = None,  # NEW: Content filtering
+        apply_exclusions: bool = True,  # Whether to apply exclusion regions during extraction
     ) -> TableResult:  # Return type allows Optional[str] for cells
         """
         Extract a table from this region.
@@ -1258,6 +1261,8 @@ class Region(
                 - A callable that takes text and returns True to KEEP the character
                 - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
                 Works with all extraction methods by filtering cell content.
+            apply_exclusions: Whether to apply exclusion regions during text extraction (default: True).
+                When True, text within excluded regions (e.g., headers/footers) will not be extracted.
         Returns:
             Table data as a list of rows, where each row is a list of cell values (str or None).
@@ -1295,7 +1300,9 @@ class Region(
                     )
                     return TableResult(
                         self._extract_table_from_cells(
-                            cell_regions_in_table, content_filter=content_filter
+                            cell_regions_in_table,
+                            content_filter=content_filter,
+                            apply_exclusions=apply_exclusions,
                         )
                     )
@@ -1379,16 +1386,22 @@ class Region(
         # Use the selected method
         if effective_method == "tatr":
             table_rows = self._extract_table_tatr(
-                use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter
+                use_ocr=use_ocr,
+                ocr_config=ocr_config,
+                content_filter=content_filter,
+                apply_exclusions=apply_exclusions,
             )
         elif effective_method == "text":
             current_text_options = text_options.copy()
             current_text_options["cell_extraction_func"] = cell_extraction_func
             current_text_options["show_progress"] = show_progress
             current_text_options["content_filter"] = content_filter
+            current_text_options["apply_exclusions"] = apply_exclusions
             table_rows = self._extract_table_text(**current_text_options)
         elif effective_method == "pdfplumber":
-            table_rows = self._extract_table_plumber(table_settings, content_filter=content_filter)
+            table_rows = self._extract_table_plumber(
+                table_settings, content_filter=content_filter, apply_exclusions=apply_exclusions
+            )
         else:
             raise ValueError(
                 f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
@@ -1602,7 +1615,9 @@ class Region(
         # Return empty list if no tables found
         return []
-    def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
+    def _extract_table_plumber(
+        self, table_settings: dict, content_filter=None, apply_exclusions=True
+    ) -> List[List[str]]:
         """
         Extract table using pdfplumber's table extraction.
         This method extracts the largest table within the region.
@@ -1644,7 +1659,7 @@ class Region(
         # -------------------------------------------------------------
         base_plumber_page = self.page._page
-        if getattr(self.page, "_exclusions", None):
+        if apply_exclusions and getattr(self.page, "_exclusions", None):
             exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
             def _keep_char(obj):
@@ -1699,7 +1714,7 @@ class Region(
         return []
     def _extract_table_tatr(
-        self, use_ocr=False, ocr_config=None, content_filter=None
+        self, use_ocr=False, ocr_config=None, content_filter=None, apply_exclusions=True
     ) -> List[List[str]]:
         """
         Extract table using TATR structure detection.
@@ -1787,7 +1802,7 @@ class Region(
                             continue
                 # Fallback to normal extraction
-                header_text = header.extract_text().strip()
+                header_text = header.extract_text(apply_exclusions=apply_exclusions).strip()
                 if content_filter is not None:
                     header_text = self._apply_content_filter_to_text(header_text, content_filter)
                 header_texts.append(header_text)
@@ -1822,7 +1837,7 @@ class Region(
                                 continue
                     # Fallback to normal extraction
-                    cell_text = cell_region.extract_text().strip()
+                    cell_text = cell_region.extract_text(apply_exclusions=apply_exclusions).strip()
                     if content_filter is not None:
                         cell_text = self._apply_content_filter_to_text(cell_text, content_filter)
                     row_cells.append(cell_text)
@@ -1838,7 +1853,7 @@ class Region(
                             continue
                 # Fallback to normal extraction
-                row_text = row.extract_text().strip()
+                row_text = row.extract_text(apply_exclusions=apply_exclusions).strip()
                 if content_filter is not None:
                     row_text = self._apply_content_filter_to_text(row_text, content_filter)
                 row_cells.append(row_text)
@@ -1864,6 +1879,8 @@ class Region(
         show_progress = text_options.pop("show_progress", False)
         # --- Get content_filter option --- #
         content_filter = text_options.pop("content_filter", None)
+        # --- Get apply_exclusions option --- #
+        apply_exclusions = text_options.pop("apply_exclusions", True)
         # Analyze structure first (or use cached results)
         if "text_table_structure" in self.analyses:
@@ -1944,7 +1961,9 @@ class Region(
                         cell_value = None
                 else:
                     cell_value = cell_region.extract_text(
-                        layout=False, apply_exclusions=False, content_filter=content_filter
+                        layout=False,
+                        apply_exclusions=apply_exclusions,
+                        content_filter=content_filter,
                     ).strip()
                 rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
@@ -3395,7 +3414,7 @@ class Region(
     # ------------------------------------------------------------------
     def _extract_table_from_cells(
-        self, cell_regions: List["Region"], content_filter=None
+        self, cell_regions: List["Region"], content_filter=None, apply_exclusions=True
     ) -> List[List[Optional[str]]]:
         """Construct a table (list-of-lists) from table_cell regions.
@@ -3437,7 +3456,9 @@ class Region(
                     r_idx = int(cell.metadata.get("row_index"))
                     c_idx = int(cell.metadata.get("col_index"))
                     text_val = cell.extract_text(
-                        layout=False, apply_exclusions=True, content_filter=content_filter
+                        layout=False,
+                        apply_exclusions=apply_exclusions,
+                        content_filter=content_filter,
                     ).strip()
                     table_grid[r_idx][c_idx] = text_val if text_val else None
                 except Exception as _err:
@@ -3486,7 +3507,7 @@ class Region(
             col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
             text_val = cell.extract_text(
-                layout=False, apply_exclusions=False, content_filter=content_filter
+                layout=False, apply_exclusions=apply_exclusions, content_filter=content_filter
             ).strip()
             table_grid[row_idx][col_idx] = text_val if text_val else None

natural_pdf/tables/result.py CHANGED Viewed

@@ -41,7 +41,7 @@ class TableResult(Sequence):
     def to_df(
         self,
-        header: Union[str, int, List[int], None] = "first",
+        header: Union[str, int, List[int], List[str], None] = "first",
         index_col=None,
         skip_repeating_headers=None,
         keep_blank: bool = False,
@@ -51,8 +51,8 @@ class TableResult(Sequence):
         Parameters
         ----------
-        header : "first" | int | list[int] | None, default "first"
-            • "first" – use row 0 as column names.\n            • int       – use that row index.\n            • list[int] – multi-row header.\n            • None/False– no header.
+        header : "first" | int | list[int] | list[str] | None, default "first"
+            • "first" – use row 0 as column names.\n            • int       – use that row index.\n            • list[int] – multi-row header.\n            • list[str] – custom column names.\n            • None/False– no header.
             Note: If the header row has a different number of columns than the
             body rows, the method will automatically fall back to header=None
@@ -84,7 +84,11 @@ class TableResult(Sequence):
         # Determine default for skip_repeating_headers based on header parameter
         if skip_repeating_headers is None:
-            skip_repeating_headers = header is not None and header is not False
+            skip_repeating_headers = (
+                header is not None
+                and header is not False
+                and not (isinstance(header, (list, tuple)) and len(header) == 0)
+            )
         # Determine header rows and body rows
         body = rows
@@ -97,10 +101,31 @@ class TableResult(Sequence):
         elif isinstance(header, int):
             hdr = rows[header]
             body = rows[:header] + rows[header + 1 :]
-        elif isinstance(header, (list, tuple)):
+        elif isinstance(header, (list, tuple)) and all(isinstance(i, int) for i in header):
+            # List of integers - multi-row header
             hdr_rows = [rows[i] for i in header]
             body = [r for idx, r in enumerate(rows) if idx not in header]
             hdr = hdr_rows
+        elif (
+            isinstance(header, (list, tuple))
+            and len(header) > 0
+            and all(isinstance(i, str) for i in header)
+        ):
+            # List of strings - custom column names
+            hdr = list(header)
+            body = rows
+            # Validate column count matches
+            if body:
+                max_cols = max(len(row) for row in body)
+                if len(hdr) != max_cols:
+                    raise ValueError(
+                        f"Number of column names ({len(hdr)}) must match "
+                        f"number of columns in data ({max_cols})"
+                    )
+        elif isinstance(header, (list, tuple)) and len(header) == 0:
+            # Empty list behaves like None
+            hdr = None
+            body = rows
         else:
             raise ValueError("Invalid value for header parameter")
@@ -125,7 +150,12 @@ class TableResult(Sequence):
                 pass
         # Check for header/body column count mismatch and fallback to no header
-        if hdr is not None and body:
+        if (
+            hdr is not None
+            and body
+            and not (isinstance(header, (list, tuple)) and all(isinstance(i, str) for i in header))
+        ):
+            # Skip this check for custom string headers
             # Get the maximum number of columns from all body rows
             # This handles cases where some rows have different column counts
             max_cols = max(len(row) for row in body) if body else 0
@@ -144,6 +174,9 @@ class TableResult(Sequence):
                 hdr = None
                 body = self._rows  # Use all rows as body
+        # Handle empty list case - pandas needs None not empty list
+        if isinstance(hdr, list) and len(hdr) == 0:
+            hdr = None
         df = pd.DataFrame(body, columns=hdr)
         # Convert empty strings to NaN by default

natural_pdf/vision/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""Vision module for visual similarity and pattern matching"""
+from .mixin import VisualSearchMixin
+from .results import Match, MatchResults
+from .similarity import VisualMatcher, compute_phash
+__all__ = ["VisualMatcher", "compute_phash", "Match", "MatchResults", "VisualSearchMixin"]

natural_pdf/vision/mixin.py ADDED Viewed

@@ -0,0 +1,209 @@
+"""Mixin to add visual similarity search to Page/PDF/PDFCollection"""
+from typing import List, Optional, Tuple, Union
+import numpy as np
+from PIL import Image
+from tqdm.auto import tqdm
+from .results import Match, MatchResults
+from .similarity import VisualMatcher, compute_phash
+class VisualSearchMixin:
+    """Add a ``find_similar`` visual search method to classes that include this mixin.
+    The host object is searched as follows: an instance whose class is named
+    ``PDFCollection`` is iterated and every PDF's ``pages`` are searched; an
+    object with a ``pages`` attribute has those pages searched; an object with
+    a ``number`` attribute is treated as a single page.
+    """
+    def find_similar(
+        self,
+        examples: Union["Element", "Region", List[Union["Element", "Region"]]],
+        using: str = "vision",
+        confidence: float = 0.6,
+        sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
+        resolution: int = 72,
+        hash_size: int = 20,
+        step_factor: float = 0.1,
+        max_per_page: Optional[int] = None,
+        show_progress: bool = True,
+        **kwargs,
+    ) -> MatchResults:
+        """
+        Find regions visually similar to the given example(s).
+        Args:
+            examples: Single element/region or list of examples to search for
+            using: Search method - currently only 'vision' is supported
+            confidence: Minimum similarity score (0-1)
+            sizes: Size variations to search. Can be:
+                   - float: ±percentage (e.g., 0.2 = 80%-120%)
+                   - tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.2))
+                   - tuple(min, max, step): explicit step size
+                   - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
+            resolution: Resolution for image comparison (DPI) (default: 72)
+            hash_size: Size of perceptual hash grid (default: 20)
+            step_factor: Step size as fraction of template size (default: 0.1)
+            max_per_page: Maximum matches to return per page; None (or 0) means no limit
+            show_progress: Show progress bar for multi-page searches (default: True)
+            **kwargs: Additional options forwarded to ``VisualMatcher.find_matches_in_image``
+        Returns:
+            MatchResults collection
+        Raises:
+            NotImplementedError: If ``using`` is anything other than 'vision'.
+            TypeError: If the host object is neither a PDFCollection, an object
+                with ``pages``, nor a single page with ``number``.
+        """
+        if using != "vision":
+            raise NotImplementedError(f"using='{using}' not yet supported")
+        # Ensure examples is a list
+        if not isinstance(examples, list):
+            examples = [examples]
+        # Initialize matcher with specified hash size
+        matcher = VisualMatcher(hash_size=hash_size)
+        # Prepare templates: render each example once and hash it up front
+        templates = []
+        for example in examples:
+            example_image = example.render(resolution=resolution, crop=True)
+            template_hash = compute_phash(example_image, hash_size=hash_size)
+            templates.append({"image": example_image, "hash": template_hash, "source": example})
+        # Get pages to search based on the object type.
+        # The class is matched by name rather than isinstance because
+        # PDFCollection is not imported in this module.
+        if type(self).__name__ == "PDFCollection":
+            # PDFCollection needs to iterate through all PDFs
+            pages_to_search = []
+            for pdf in self:
+                pages_to_search.extend(pdf.pages)
+        elif hasattr(self, "pages"):  # PDF
+            pages_to_search = self.pages
+        elif hasattr(self, "number"):  # Single page
+            pages_to_search = [self]
+        else:
+            raise TypeError(f"Cannot search in {type(self)}")
+        # Convert page coordinates to image coordinates (PDF is 72 DPI).
+        # Loop-invariant, so computed once for all pages.
+        scale = resolution / 72.0
+        # Estimate the total number of sliding windows for the progress bar
+        total_operations = 0
+        if show_progress:
+            # Get scales that will be searched
+            scales = matcher._get_search_scales(sizes)
+            for page in pages_to_search:
+                # Estimate page image size
+                page_w = int(page.width * resolution / 72.0)
+                page_h = int(page.height * resolution / 72.0)
+                for template_data in templates:
+                    template_w, template_h = template_data["image"].size
+                    for size_scale in scales:
+                        scaled_w = int(template_w * size_scale)
+                        scaled_h = int(template_h * size_scale)
+                        if scaled_w <= page_w and scaled_h <= page_h:
+                            step_x = max(1, int(scaled_w * step_factor))
+                            step_y = max(1, int(scaled_h * step_factor))
+                            x_windows = len(range(0, page_w - scaled_w + 1, step_x))
+                            y_windows = len(range(0, page_h - scaled_h + 1, step_y))
+                            total_operations += x_windows * y_windows
+        all_matches = []
+        # Create single progress bar for all operations
+        progress_bar = None
+        operations_done = 0
+        last_update = 0
+        update_frequency = max(1, total_operations // 1000)  # Update at most 1000 times
+        if show_progress and total_operations > 0:
+            progress_bar = tqdm(
+                total=total_operations,
+                desc="Searching",
+                unit="window",
+                miniters=update_frequency,  # Minimum iterations between updates
+                mininterval=0.1,  # Minimum time between updates (seconds)
+            )
+        try:
+            for page in pages_to_search:
+                # Render the full page once
+                page_image = page.render(resolution=resolution)
+                page_matches = []
+                # Search for each template
+                for template_idx, template_data in enumerate(templates):
+                    template_image = template_data["image"]
+                    template_hash = template_data["hash"]
+                    # Progress callback feeding the shared progress bar. The loop
+                    # variables are bound as defaults so the callback always
+                    # reports the page/template it was created for.
+                    def update_progress(page=page, template_idx=template_idx):
+                        nonlocal operations_done, last_update
+                        operations_done += 1
+                        # Only update progress bar every N operations to avoid overwhelming output
+                        if progress_bar is not None and (
+                            operations_done - last_update >= update_frequency
+                            or operations_done == total_operations
+                        ):
+                            progress_bar.update(operations_done - last_update)
+                            last_update = operations_done
+                            # Update description with current page/template info
+                            if len(pages_to_search) > 1:
+                                progress_bar.set_description(
+                                    f"Page {page.number}/{len(pages_to_search)}"
+                                )
+                            elif len(templates) > 1:
+                                progress_bar.set_description(
+                                    f"Template {template_idx + 1}/{len(templates)}"
+                                )
+                    # Find matches in this page - never show internal progress
+                    candidates = matcher.find_matches_in_image(
+                        template_image,
+                        page_image,
+                        template_hash=template_hash,
+                        confidence_threshold=confidence,
+                        sizes=sizes,
+                        step_factor=step_factor,
+                        show_progress=False,  # We handle progress ourselves
+                        progress_callback=update_progress if progress_bar is not None else None,
+                        **kwargs,
+                    )
+                    # Convert image coordinates back to PDF coordinates
+                    for candidate in candidates:
+                        img_x0, img_y0, img_x1, img_y1 = candidate.bbox
+                        # Convert from image pixels to PDF points by dividing by
+                        # the render scale; no axis flip is applied.
+                        match = Match(
+                            page=page,
+                            bbox=(
+                                img_x0 / scale,
+                                img_y0 / scale,
+                                img_x1 / scale,
+                                img_y1 / scale,
+                            ),
+                            confidence=candidate.confidence,
+                            source_example=template_data["source"],
+                        )
+                        page_matches.append(match)
+                # Apply max_per_page limit if specified
+                if max_per_page and len(page_matches) > max_per_page:
+                    # Sort by confidence and take top N
+                    page_matches.sort(key=lambda m: m.confidence, reverse=True)
+                    page_matches = page_matches[:max_per_page]
+                all_matches.extend(page_matches)
+        finally:
+            # Always release the progress bar, even if rendering or matching
+            # raised; flush any progress not yet reported first.
+            if progress_bar is not None:
+                pending = operations_done - last_update
+                if pending > 0:
+                    progress_bar.update(pending)
+                progress_bar.close()
+        return MatchResults(all_matches)

natural-pdf 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

natural-pdf 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl