natural-pdf 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/core/page.py +2 -0
- natural_pdf/core/pdf.py +4 -1
- natural_pdf/core/pdf_collection.py +131 -4
- natural_pdf/core/render_spec.py +2 -2
- natural_pdf/elements/base.py +18 -14
- natural_pdf/elements/region.py +10 -8
- natural_pdf/vision/__init__.py +7 -0
- natural_pdf/vision/mixin.py +209 -0
- natural_pdf/vision/results.py +146 -0
- natural_pdf/vision/similarity.py +321 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/RECORD +16 -12
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -78,6 +78,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
78
78
|
|
79
79
|
# # Import new utils
|
80
80
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
81
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
81
82
|
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
|
82
83
|
|
83
84
|
# --- End Classification Imports --- #
|
@@ -101,6 +102,7 @@ class Page(
|
|
101
102
|
ExtractionMixin,
|
102
103
|
ShapeDetectionMixin,
|
103
104
|
DescribeMixin,
|
105
|
+
VisualSearchMixin,
|
104
106
|
Visualizable,
|
105
107
|
):
|
106
108
|
"""Enhanced Page wrapper built on top of pdfplumber.Page.
|
natural_pdf/core/pdf.py
CHANGED
@@ -42,6 +42,7 @@ from natural_pdf.ocr import OCRManager, OCROptions
|
|
42
42
|
from natural_pdf.selectors.parser import parse_selector
|
43
43
|
from natural_pdf.text_mixin import TextMixin
|
44
44
|
from natural_pdf.utils.locks import pdf_render_lock
|
45
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
45
46
|
|
46
47
|
if TYPE_CHECKING:
|
47
48
|
from natural_pdf.elements.element_collection import ElementCollection
|
@@ -252,7 +253,9 @@ class _LazyPageList(Sequence):
|
|
252
253
|
# --- End Lazy Page List Helper --- #
|
253
254
|
|
254
255
|
|
255
|
-
class PDF(
|
256
|
+
class PDF(
|
257
|
+
TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, VisualSearchMixin, Visualizable
|
258
|
+
):
|
256
259
|
"""Enhanced PDF wrapper built on top of pdfplumber.
|
257
260
|
|
258
261
|
This class provides a fluent interface for working with PDF documents,
|
@@ -40,6 +40,7 @@ logger = logging.getLogger(__name__)
|
|
40
40
|
from natural_pdf.core.pdf import PDF
|
41
41
|
from natural_pdf.elements.region import Region
|
42
42
|
from natural_pdf.export.mixin import ExportMixin
|
43
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
43
44
|
|
44
45
|
# --- Search Imports ---
|
45
46
|
try:
|
@@ -69,8 +70,8 @@ from natural_pdf.search.searchable_mixin import SearchableMixin # Import the ne
|
|
69
70
|
|
70
71
|
|
71
72
|
class PDFCollection(
|
72
|
-
SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin
|
73
|
-
):
|
73
|
+
SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin, VisualSearchMixin
|
74
|
+
):
|
74
75
|
def __init__(
|
75
76
|
self,
|
76
77
|
source: Union[str, Iterable[Union[str, "PDF"]]],
|
@@ -258,8 +259,6 @@ class PDFCollection(
|
|
258
259
|
return iter(self._pdfs)
|
259
260
|
|
260
261
|
def __repr__(self) -> str:
|
261
|
-
# Removed search status
|
262
|
-
return f"<PDFCollection(count={len(self._pdfs)})>"
|
263
262
|
return f"<PDFCollection(count={len(self._pdfs)})>"
|
264
263
|
|
265
264
|
@property
|
@@ -267,6 +266,134 @@ class PDFCollection(
|
|
267
266
|
"""Returns the list of PDF objects held by the collection."""
|
268
267
|
return self._pdfs
|
269
268
|
|
269
|
+
def show(self, limit: Optional[int] = 30, per_pdf_limit: Optional[int] = 10, **kwargs):
|
270
|
+
"""
|
271
|
+
Display all PDFs in the collection with labels.
|
272
|
+
|
273
|
+
Each PDF is shown with its pages in a grid layout (6 columns by default),
|
274
|
+
and all PDFs are stacked vertically with labels.
|
275
|
+
|
276
|
+
Args:
|
277
|
+
limit: Maximum total pages to show across all PDFs (default: 30)
|
278
|
+
per_pdf_limit: Maximum pages to show per PDF (default: 10)
|
279
|
+
**kwargs: Additional arguments passed to each PDF's show() method
|
280
|
+
(e.g., columns, exclusions, resolution, etc.)
|
281
|
+
|
282
|
+
Returns:
|
283
|
+
Displayed image in Jupyter or None
|
284
|
+
"""
|
285
|
+
if not self._pdfs:
|
286
|
+
print("Empty collection")
|
287
|
+
return None
|
288
|
+
|
289
|
+
# Import here to avoid circular imports
|
290
|
+
import numpy as np
|
291
|
+
from PIL import Image, ImageDraw, ImageFont
|
292
|
+
|
293
|
+
# Calculate pages per PDF if total limit is set
|
294
|
+
if limit and not per_pdf_limit:
|
295
|
+
per_pdf_limit = max(1, limit // len(self._pdfs))
|
296
|
+
|
297
|
+
# Collect images from each PDF
|
298
|
+
all_images = []
|
299
|
+
total_pages_shown = 0
|
300
|
+
|
301
|
+
for pdf in self._pdfs:
|
302
|
+
if limit and total_pages_shown >= limit:
|
303
|
+
break
|
304
|
+
|
305
|
+
# Calculate limit for this PDF
|
306
|
+
pdf_limit = per_pdf_limit
|
307
|
+
if limit:
|
308
|
+
remaining = limit - total_pages_shown
|
309
|
+
pdf_limit = min(per_pdf_limit or remaining, remaining)
|
310
|
+
|
311
|
+
# Get PDF identifier
|
312
|
+
pdf_name = getattr(pdf, "filename", None) or getattr(pdf, "path", "Unknown")
|
313
|
+
if isinstance(pdf_name, Path):
|
314
|
+
pdf_name = pdf_name.name
|
315
|
+
elif "/" in str(pdf_name):
|
316
|
+
pdf_name = str(pdf_name).split("/")[-1]
|
317
|
+
|
318
|
+
# Render this PDF
|
319
|
+
try:
|
320
|
+
# Get render specs from the PDF
|
321
|
+
render_specs = pdf._get_render_specs(mode="show", max_pages=pdf_limit, **kwargs)
|
322
|
+
|
323
|
+
if not render_specs:
|
324
|
+
continue
|
325
|
+
|
326
|
+
# Get the highlighter and render without displaying
|
327
|
+
highlighter = pdf._get_highlighter()
|
328
|
+
pdf_image = highlighter.unified_render(
|
329
|
+
specs=render_specs,
|
330
|
+
layout="grid" if len(render_specs) > 1 else "single",
|
331
|
+
columns=6,
|
332
|
+
**kwargs,
|
333
|
+
)
|
334
|
+
|
335
|
+
if pdf_image:
|
336
|
+
# Add label above the PDF image
|
337
|
+
label_height = 40
|
338
|
+
label_bg_color = (240, 240, 240)
|
339
|
+
label_text_color = (0, 0, 0)
|
340
|
+
|
341
|
+
# Create new image with space for label
|
342
|
+
width, height = pdf_image.size
|
343
|
+
labeled_image = Image.new("RGB", (width, height + label_height), "white")
|
344
|
+
|
345
|
+
# Draw label background
|
346
|
+
draw = ImageDraw.Draw(labeled_image)
|
347
|
+
draw.rectangle([0, 0, width, label_height], fill=label_bg_color)
|
348
|
+
|
349
|
+
# Draw label text
|
350
|
+
try:
|
351
|
+
# Try to use a nice font if available
|
352
|
+
font = ImageFont.truetype("Arial", 20)
|
353
|
+
except:
|
354
|
+
# Fallback to default font
|
355
|
+
font = ImageFont.load_default()
|
356
|
+
|
357
|
+
label_text = f"{pdf_name} ({len(pdf.pages)} pages)"
|
358
|
+
draw.text((10, 10), label_text, fill=label_text_color, font=font)
|
359
|
+
|
360
|
+
# Paste PDF image below label
|
361
|
+
labeled_image.paste(pdf_image, (0, label_height))
|
362
|
+
|
363
|
+
all_images.append(labeled_image)
|
364
|
+
total_pages_shown += min(pdf_limit, len(pdf.pages))
|
365
|
+
|
366
|
+
except Exception as e:
|
367
|
+
logger.warning(f"Failed to render PDF {pdf_name}: {e}")
|
368
|
+
continue
|
369
|
+
|
370
|
+
if not all_images:
|
371
|
+
print("No PDFs could be rendered")
|
372
|
+
return None
|
373
|
+
|
374
|
+
# Combine all images vertically
|
375
|
+
if len(all_images) == 1:
|
376
|
+
combined = all_images[0]
|
377
|
+
else:
|
378
|
+
# Add spacing between PDFs
|
379
|
+
spacing = 20
|
380
|
+
total_height = sum(img.height for img in all_images) + spacing * (len(all_images) - 1)
|
381
|
+
max_width = max(img.width for img in all_images)
|
382
|
+
|
383
|
+
combined = Image.new("RGB", (max_width, total_height), "white")
|
384
|
+
|
385
|
+
y_offset = 0
|
386
|
+
for i, img in enumerate(all_images):
|
387
|
+
# Center images if they're narrower than max width
|
388
|
+
x_offset = (max_width - img.width) // 2
|
389
|
+
combined.paste(img, (x_offset, y_offset))
|
390
|
+
y_offset += img.height
|
391
|
+
if i < len(all_images) - 1:
|
392
|
+
y_offset += spacing
|
393
|
+
|
394
|
+
# Return the combined image (Jupyter will display it automatically)
|
395
|
+
return combined
|
396
|
+
|
270
397
|
@overload
|
271
398
|
def find_all(
|
272
399
|
self,
|
natural_pdf/core/render_spec.py
CHANGED
@@ -186,7 +186,7 @@ class Visualizable:
|
|
186
186
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
187
187
|
labels: bool = True,
|
188
188
|
label_format: Optional[str] = None,
|
189
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
189
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
190
190
|
legend_position: str = "right",
|
191
191
|
annotate: Optional[Union[str, List[str]]] = None,
|
192
192
|
# Layout options for multi-page/region
|
@@ -211,7 +211,7 @@ class Visualizable:
|
|
211
211
|
color: Default highlight color
|
212
212
|
labels: Whether to show labels for highlights
|
213
213
|
label_format: Format string for labels (e.g., "Element {index}")
|
214
|
-
highlights: Additional highlight groups to show
|
214
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
215
215
|
legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
|
216
216
|
annotate: Attribute name(s) to display on highlights (string or list)
|
217
217
|
layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)
|
natural_pdf/elements/base.py
CHANGED
@@ -1192,7 +1192,7 @@ class Element(
|
|
1192
1192
|
self,
|
1193
1193
|
mode: Literal["show", "render"] = "show",
|
1194
1194
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
1195
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
1195
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
1196
1196
|
crop: Union[bool, Literal["content"]] = False,
|
1197
1197
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
1198
1198
|
label: Optional[str] = None,
|
@@ -1203,7 +1203,7 @@ class Element(
|
|
1203
1203
|
Args:
|
1204
1204
|
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
1205
1205
|
color: Color for highlighting this element in show mode
|
1206
|
-
highlights: Additional highlight groups to show
|
1206
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
1207
1207
|
crop: Whether to crop to element bounds
|
1208
1208
|
crop_bbox: Explicit crop bounds
|
1209
1209
|
label: Optional label for this element
|
@@ -1225,19 +1225,23 @@ class Element(
|
|
1225
1225
|
if hasattr(self, "bbox") and self.bbox:
|
1226
1226
|
spec.crop_bbox = self.bbox
|
1227
1227
|
|
1228
|
-
# Add highlight in show mode
|
1229
|
-
if mode == "show":
|
1230
|
-
#
|
1231
|
-
|
1232
|
-
|
1233
|
-
|
1234
|
-
|
1235
|
-
|
1236
|
-
|
1237
|
-
|
1228
|
+
# Add highlight in show mode (unless explicitly disabled with highlights=False)
|
1229
|
+
if mode == "show" and highlights is not False:
|
1230
|
+
# Only highlight this element if:
|
1231
|
+
# 1. We're not cropping, OR
|
1232
|
+
# 2. We're cropping but color was explicitly specified
|
1233
|
+
if not crop or color is not None:
|
1234
|
+
# Use provided label or generate one
|
1235
|
+
element_label = label if label is not None else self.__class__.__name__
|
1236
|
+
|
1237
|
+
spec.add_highlight(
|
1238
|
+
element=self,
|
1239
|
+
color=color or "red", # Default red for single element
|
1240
|
+
label=element_label,
|
1241
|
+
)
|
1238
1242
|
|
1239
|
-
# Add additional highlight groups if provided
|
1240
|
-
if highlights:
|
1243
|
+
# Add additional highlight groups if provided (and highlights is a list)
|
1244
|
+
if highlights and isinstance(highlights, list):
|
1241
1245
|
for group in highlights:
|
1242
1246
|
group_elements = group.get("elements", [])
|
1243
1247
|
group_color = group.get("color", color)
|
natural_pdf/elements/region.py
CHANGED
@@ -221,7 +221,7 @@ class Region(
|
|
221
221
|
self,
|
222
222
|
mode: Literal["show", "render"] = "show",
|
223
223
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
224
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
224
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
225
225
|
crop: Union[bool, Literal["content"]] = True, # Default to True for regions
|
226
226
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
227
227
|
**kwargs,
|
@@ -231,7 +231,7 @@ class Region(
|
|
231
231
|
Args:
|
232
232
|
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
233
233
|
color: Color for highlighting this region in show mode
|
234
|
-
highlights: Additional highlight groups to show
|
234
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
235
235
|
crop: Whether to crop to this region
|
236
236
|
crop_bbox: Explicit crop bounds (overrides region bounds)
|
237
237
|
**kwargs: Additional parameters
|
@@ -250,10 +250,12 @@ class Region(
|
|
250
250
|
# Crop to this region's bounds
|
251
251
|
spec.crop_bbox = self.bbox
|
252
252
|
|
253
|
-
# Add highlights in show mode
|
254
|
-
if mode == "show":
|
255
|
-
#
|
256
|
-
|
253
|
+
# Add highlights in show mode (unless explicitly disabled with highlights=False)
|
254
|
+
if mode == "show" and highlights is not False:
|
255
|
+
# Only highlight this region if:
|
256
|
+
# 1. We're not cropping, OR
|
257
|
+
# 2. We're cropping but color was explicitly specified
|
258
|
+
if not crop or color is not None:
|
257
259
|
spec.add_highlight(
|
258
260
|
bbox=self.bbox,
|
259
261
|
polygon=self.polygon if self.has_polygon else None,
|
@@ -261,8 +263,8 @@ class Region(
|
|
261
263
|
label=self.label or self.name or "Region",
|
262
264
|
)
|
263
265
|
|
264
|
-
# Add additional highlight groups if provided
|
265
|
-
if highlights:
|
266
|
+
# Add additional highlight groups if provided (and highlights is a list)
|
267
|
+
if highlights and isinstance(highlights, list):
|
266
268
|
for group in highlights:
|
267
269
|
elements = group.get("elements", [])
|
268
270
|
group_color = group.get("color", color)
|
@@ -0,0 +1,7 @@
|
|
1
|
+
"""Vision module for visual similarity and pattern matching"""
|
2
|
+
|
3
|
+
from .mixin import VisualSearchMixin
|
4
|
+
from .results import Match, MatchResults
|
5
|
+
from .similarity import VisualMatcher, compute_phash
|
6
|
+
|
7
|
+
__all__ = ["VisualMatcher", "compute_phash", "Match", "MatchResults", "VisualSearchMixin"]
|
@@ -0,0 +1,209 @@
|
|
1
|
+
"""Mixin to add visual similarity search to Page/PDF/PDFCollection"""
|
2
|
+
|
3
|
+
from typing import List, Optional, Tuple, Union
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
from PIL import Image
|
7
|
+
from tqdm.auto import tqdm
|
8
|
+
|
9
|
+
from .results import Match, MatchResults
|
10
|
+
from .similarity import VisualMatcher, compute_phash
|
11
|
+
|
12
|
+
|
13
|
+
class VisualSearchMixin:
|
14
|
+
"""Add find_similar method to classes that include this mixin"""
|
15
|
+
|
16
|
+
def find_similar(
|
17
|
+
self,
|
18
|
+
examples: Union["Element", "Region", List[Union["Element", "Region"]]],
|
19
|
+
using: str = "vision",
|
20
|
+
confidence: float = 0.6,
|
21
|
+
sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
|
22
|
+
resolution: int = 72,
|
23
|
+
hash_size: int = 20,
|
24
|
+
step_factor: float = 0.1,
|
25
|
+
max_per_page: Optional[int] = None,
|
26
|
+
show_progress: bool = True,
|
27
|
+
**kwargs,
|
28
|
+
) -> MatchResults:
|
29
|
+
"""
|
30
|
+
Find regions visually similar to the given example(s).
|
31
|
+
|
32
|
+
Args:
|
33
|
+
examples: Single element/region or list of examples to search for
|
34
|
+
using: Search method - currently only 'vision' is supported
|
35
|
+
confidence: Minimum similarity score (0-1)
|
36
|
+
sizes: Size variations to search. Can be:
|
37
|
+
- float: ±percentage (e.g., 0.2 = 80%-120%)
|
38
|
+
- tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.0))
|
39
|
+
- tuple(min, max, step): explicit step size
|
40
|
+
- list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
|
41
|
+
resolution: Resolution for image comparison (DPI) (default: 72)
|
42
|
+
hash_size: Size of perceptual hash grid (default: 12)
|
43
|
+
step_factor: Step size as fraction of template size (default: 0.1)
|
44
|
+
max_per_page: Maximum matches to return per page
|
45
|
+
show_progress: Show progress bar for multi-page searches (default: True)
|
46
|
+
**kwargs: Additional options
|
47
|
+
|
48
|
+
Returns:
|
49
|
+
MatchResults collection
|
50
|
+
"""
|
51
|
+
if using != "vision":
|
52
|
+
raise NotImplementedError(f"using='{using}' not yet supported")
|
53
|
+
|
54
|
+
# Ensure examples is a list
|
55
|
+
if not isinstance(examples, list):
|
56
|
+
examples = [examples]
|
57
|
+
|
58
|
+
# Initialize matcher with specified hash size
|
59
|
+
matcher = VisualMatcher(hash_size=hash_size)
|
60
|
+
|
61
|
+
# Prepare templates
|
62
|
+
templates = []
|
63
|
+
for example in examples:
|
64
|
+
# Render the example region/element
|
65
|
+
example_image = example.render(resolution=resolution, crop=True)
|
66
|
+
template_hash = compute_phash(example_image, hash_size=hash_size)
|
67
|
+
templates.append({"image": example_image, "hash": template_hash, "source": example})
|
68
|
+
|
69
|
+
# Get pages to search based on the object type
|
70
|
+
if hasattr(self, "__class__") and self.__class__.__name__ == "PDFCollection":
|
71
|
+
# PDFCollection needs to iterate through all PDFs
|
72
|
+
pages_to_search = []
|
73
|
+
for pdf in self:
|
74
|
+
pages_to_search.extend(pdf.pages)
|
75
|
+
elif hasattr(self, "pages"): # PDF
|
76
|
+
pages_to_search = self.pages
|
77
|
+
elif hasattr(self, "number"): # Single page
|
78
|
+
pages_to_search = [self]
|
79
|
+
else:
|
80
|
+
raise TypeError(f"Cannot search in {type(self)}")
|
81
|
+
|
82
|
+
# Calculate total operations for progress bar
|
83
|
+
total_operations = 0
|
84
|
+
if show_progress:
|
85
|
+
# Get scales that will be searched
|
86
|
+
scales = matcher._get_search_scales(sizes)
|
87
|
+
|
88
|
+
# Pre-calculate for all pages and templates
|
89
|
+
for page in pages_to_search:
|
90
|
+
# Estimate page image size
|
91
|
+
page_w = int(page.width * resolution / 72.0)
|
92
|
+
page_h = int(page.height * resolution / 72.0)
|
93
|
+
|
94
|
+
for template_data in templates:
|
95
|
+
template_w, template_h = template_data["image"].size
|
96
|
+
|
97
|
+
for scale in scales:
|
98
|
+
scaled_w = int(template_w * scale)
|
99
|
+
scaled_h = int(template_h * scale)
|
100
|
+
|
101
|
+
if scaled_w <= page_w and scaled_h <= page_h:
|
102
|
+
step_x = max(1, int(scaled_w * step_factor))
|
103
|
+
step_y = max(1, int(scaled_h * step_factor))
|
104
|
+
|
105
|
+
x_windows = len(range(0, page_w - scaled_w + 1, step_x))
|
106
|
+
y_windows = len(range(0, page_h - scaled_h + 1, step_y))
|
107
|
+
total_operations += x_windows * y_windows
|
108
|
+
|
109
|
+
# Search each page
|
110
|
+
all_matches = []
|
111
|
+
|
112
|
+
# Create single progress bar for all operations
|
113
|
+
progress_bar = None
|
114
|
+
operations_done = 0
|
115
|
+
last_update = 0
|
116
|
+
update_frequency = max(1, total_operations // 1000) # Update at most 1000 times
|
117
|
+
|
118
|
+
if show_progress and total_operations > 0:
|
119
|
+
progress_bar = tqdm(
|
120
|
+
total=total_operations,
|
121
|
+
desc="Searching",
|
122
|
+
unit="window",
|
123
|
+
miniters=update_frequency, # Minimum iterations between updates
|
124
|
+
mininterval=0.1, # Minimum time between updates (seconds)
|
125
|
+
)
|
126
|
+
|
127
|
+
for page_idx, page in enumerate(pages_to_search):
|
128
|
+
# Render the full page once
|
129
|
+
page_image = page.render(resolution=resolution)
|
130
|
+
|
131
|
+
# Convert page coordinates to image coordinates
|
132
|
+
scale = resolution / 72.0 # PDF is 72 DPI
|
133
|
+
|
134
|
+
page_matches = []
|
135
|
+
|
136
|
+
# Search for each template
|
137
|
+
for template_idx, template_data in enumerate(templates):
|
138
|
+
template_image = template_data["image"]
|
139
|
+
template_hash = template_data["hash"]
|
140
|
+
|
141
|
+
# Custom progress callback to update our main progress bar
|
142
|
+
def update_progress():
|
143
|
+
nonlocal operations_done, last_update
|
144
|
+
operations_done += 1
|
145
|
+
|
146
|
+
# Only update progress bar every N operations to avoid overwhelming output
|
147
|
+
if progress_bar and (
|
148
|
+
operations_done - last_update >= update_frequency
|
149
|
+
or operations_done == total_operations
|
150
|
+
):
|
151
|
+
progress_bar.update(operations_done - last_update)
|
152
|
+
last_update = operations_done
|
153
|
+
|
154
|
+
# Update description with current page/template info
|
155
|
+
if len(pages_to_search) > 1:
|
156
|
+
progress_bar.set_description(
|
157
|
+
f"Page {page.number}/{len(pages_to_search)}"
|
158
|
+
)
|
159
|
+
elif len(templates) > 1:
|
160
|
+
progress_bar.set_description(
|
161
|
+
f"Template {template_idx + 1}/{len(templates)}"
|
162
|
+
)
|
163
|
+
|
164
|
+
# Find matches in this page - never show internal progress
|
165
|
+
candidates = matcher.find_matches_in_image(
|
166
|
+
template_image,
|
167
|
+
page_image,
|
168
|
+
template_hash=template_hash,
|
169
|
+
confidence_threshold=confidence,
|
170
|
+
sizes=sizes,
|
171
|
+
step_factor=step_factor,
|
172
|
+
show_progress=False, # We handle progress ourselves
|
173
|
+
progress_callback=update_progress if progress_bar else None,
|
174
|
+
**kwargs,
|
175
|
+
)
|
176
|
+
|
177
|
+
# Convert image coordinates back to PDF coordinates
|
178
|
+
for candidate in candidates:
|
179
|
+
img_x0, img_y0, img_x1, img_y1 = candidate.bbox
|
180
|
+
|
181
|
+
# Convert from image pixels to PDF points
|
182
|
+
# No flipping needed! PDF coordinates map directly to PIL coordinates
|
183
|
+
pdf_x0 = img_x0 / scale
|
184
|
+
pdf_y0 = img_y0 / scale
|
185
|
+
pdf_x1 = img_x1 / scale
|
186
|
+
pdf_y1 = img_y1 / scale
|
187
|
+
|
188
|
+
# Create Match object
|
189
|
+
match = Match(
|
190
|
+
page=page,
|
191
|
+
bbox=(pdf_x0, pdf_y0, pdf_x1, pdf_y1),
|
192
|
+
confidence=candidate.confidence,
|
193
|
+
source_example=template_data["source"],
|
194
|
+
)
|
195
|
+
page_matches.append(match)
|
196
|
+
|
197
|
+
# Apply max_per_page limit if specified
|
198
|
+
if max_per_page and len(page_matches) > max_per_page:
|
199
|
+
# Sort by confidence and take top N
|
200
|
+
page_matches.sort(key=lambda m: m.confidence, reverse=True)
|
201
|
+
page_matches = page_matches[:max_per_page]
|
202
|
+
|
203
|
+
all_matches.extend(page_matches)
|
204
|
+
|
205
|
+
# Close progress bar
|
206
|
+
if progress_bar:
|
207
|
+
progress_bar.close()
|
208
|
+
|
209
|
+
return MatchResults(all_matches)
|
@@ -0,0 +1,146 @@
|
|
1
|
+
"""Match results for visual similarity search"""
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple
|
4
|
+
|
5
|
+
# Import Region directly as it's a base class
|
6
|
+
from natural_pdf.elements.region import Region
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from natural_pdf.core.page_collection import PageCollection
|
10
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
11
|
+
|
12
|
+
|
13
|
+
class Match(Region):
|
14
|
+
"""A region that was found via visual similarity search"""
|
15
|
+
|
16
|
+
def __init__(self, page, bbox, confidence, source_example=None, metadata=None):
|
17
|
+
"""
|
18
|
+
Initialize a Match object.
|
19
|
+
|
20
|
+
Args:
|
21
|
+
page: Page containing the match
|
22
|
+
bbox: Bounding box of the match
|
23
|
+
confidence: Similarity confidence (0-1)
|
24
|
+
source_example: The example/template that led to this match
|
25
|
+
metadata: Additional metadata about the match
|
26
|
+
"""
|
27
|
+
super().__init__(page, bbox)
|
28
|
+
self.confidence = confidence
|
29
|
+
self.source_example = source_example
|
30
|
+
self.metadata = metadata or {}
|
31
|
+
|
32
|
+
@property
|
33
|
+
def pdf(self):
|
34
|
+
"""Get the PDF containing this match"""
|
35
|
+
return self.page.pdf
|
36
|
+
|
37
|
+
def __repr__(self):
|
38
|
+
return f"<Match page={self.page.number} confidence={self.confidence:.2f} bbox={self.bbox}>"
|
39
|
+
|
40
|
+
|
41
|
+
class MatchResults:
|
42
|
+
"""Collection of Match objects with transformation methods"""
|
43
|
+
|
44
|
+
def __init__(self, matches: List[Match]):
|
45
|
+
"""Initialize with list of Match objects"""
|
46
|
+
# Import here to avoid circular import
|
47
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
48
|
+
|
49
|
+
# Create a base ElementCollection
|
50
|
+
self._collection = ElementCollection(matches)
|
51
|
+
self._matches = matches
|
52
|
+
|
53
|
+
def __len__(self):
|
54
|
+
return len(self._matches)
|
55
|
+
|
56
|
+
def __iter__(self):
|
57
|
+
return iter(self._matches)
|
58
|
+
|
59
|
+
def __getitem__(self, key):
|
60
|
+
return self._matches[key]
|
61
|
+
|
62
|
+
def filter(self, filter_func) -> "MatchResults":
|
63
|
+
"""Filter matches by a function"""
|
64
|
+
filtered = [m for m in self if filter_func(m)]
|
65
|
+
return MatchResults(filtered)
|
66
|
+
|
67
|
+
def filter_by_confidence(self, min_confidence: float) -> "MatchResults":
|
68
|
+
"""Filter matches by minimum confidence"""
|
69
|
+
return self.filter(lambda m: m.confidence >= min_confidence)
|
70
|
+
|
71
|
+
def pages(self):
|
72
|
+
"""Get unique pages containing matches"""
|
73
|
+
# Import here to avoid circular import
|
74
|
+
from natural_pdf.core.page_collection import PageCollection
|
75
|
+
|
76
|
+
# Get unique pages while preserving order
|
77
|
+
seen = set()
|
78
|
+
unique_pages = []
|
79
|
+
for match in self:
|
80
|
+
if match.page not in seen:
|
81
|
+
seen.add(match.page)
|
82
|
+
unique_pages.append(match.page)
|
83
|
+
|
84
|
+
# Attach matches to each page
|
85
|
+
for page in unique_pages:
|
86
|
+
page._matches = MatchResults([m for m in self if m.page == page])
|
87
|
+
|
88
|
+
return PageCollection(unique_pages)
|
89
|
+
|
90
|
+
def pdfs(self):
|
91
|
+
"""Get unique PDFs containing matches"""
|
92
|
+
# Import here to avoid circular import
|
93
|
+
from natural_pdf.core.pdf_collection import PDFCollection
|
94
|
+
|
95
|
+
# Get unique PDFs while preserving order
|
96
|
+
seen = set()
|
97
|
+
unique_pdfs = []
|
98
|
+
for match in self:
|
99
|
+
if match.pdf not in seen:
|
100
|
+
seen.add(match.pdf)
|
101
|
+
unique_pdfs.append(match.pdf)
|
102
|
+
|
103
|
+
# Attach matches to each PDF
|
104
|
+
for pdf in unique_pdfs:
|
105
|
+
pdf._matches = MatchResults([m for m in self if m.pdf == pdf])
|
106
|
+
|
107
|
+
return PDFCollection(unique_pdfs)
|
108
|
+
|
109
|
+
def group_by_page(self) -> Iterator[Tuple[Any, "MatchResults"]]:
|
110
|
+
"""Group matches by page"""
|
111
|
+
from itertools import groupby
|
112
|
+
|
113
|
+
# Sort by PDF filename and page number
|
114
|
+
sorted_matches = sorted(self, key=lambda m: (getattr(m.pdf, "filename", ""), m.page.number))
|
115
|
+
|
116
|
+
for page, matches in groupby(sorted_matches, key=lambda m: m.page):
|
117
|
+
yield page, MatchResults(list(matches))
|
118
|
+
|
119
|
+
def sort_by_confidence(self, descending: bool = True) -> "MatchResults":
|
120
|
+
"""Sort matches by confidence score"""
|
121
|
+
sorted_matches = sorted(self, key=lambda m: m.confidence, reverse=descending)
|
122
|
+
return MatchResults(sorted_matches)
|
123
|
+
|
124
|
+
def regions(self):
|
125
|
+
"""Get all matches as an ElementCollection of regions"""
|
126
|
+
# Import here to avoid circular import
|
127
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
128
|
+
|
129
|
+
# Matches are already Region objects, so just wrap them
|
130
|
+
return ElementCollection(list(self))
|
131
|
+
|
132
|
+
def show(self, **kwargs):
|
133
|
+
"""Show all matches using ElementCollection.show()"""
|
134
|
+
# Get regions and show them
|
135
|
+
return self.regions().show(**kwargs)
|
136
|
+
|
137
|
+
def __repr__(self):
|
138
|
+
if len(self) == 0:
|
139
|
+
return "<MatchResults: empty>"
|
140
|
+
elif len(self) == 1:
|
141
|
+
return f"<MatchResults: 1 match>"
|
142
|
+
else:
|
143
|
+
conf_range = (
|
144
|
+
f"{min(m.confidence for m in self):.2f}-{max(m.confidence for m in self):.2f}"
|
145
|
+
)
|
146
|
+
return f"<MatchResults: {len(self)} matches, confidence {conf_range}>"
|
@@ -0,0 +1,321 @@
|
|
1
|
+
"""Visual similarity matching using perceptual hashing"""
|
2
|
+
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from typing import Callable, List, Optional, Tuple, Union
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
from PIL import Image
|
8
|
+
from tqdm.auto import tqdm
|
9
|
+
|
10
|
+
|
11
|
+
@dataclass
|
12
|
+
class MatchCandidate:
|
13
|
+
"""Candidate match during sliding window search"""
|
14
|
+
|
15
|
+
bbox: Tuple[float, float, float, float]
|
16
|
+
hash_value: int
|
17
|
+
confidence: float
|
18
|
+
|
19
|
+
|
20
|
+
def compute_phash(image: Image.Image, hash_size: int = 8, blur_radius: float = 0) -> int:
|
21
|
+
"""
|
22
|
+
Compute perceptual hash of an image using DCT.
|
23
|
+
|
24
|
+
Args:
|
25
|
+
image: PIL Image to hash
|
26
|
+
hash_size: Size of the hash (8 = 64 bit hash)
|
27
|
+
blur_radius: Optional blur to apply before hashing (makes more tolerant)
|
28
|
+
|
29
|
+
Returns:
|
30
|
+
Integer hash value
|
31
|
+
"""
|
32
|
+
# Convert to grayscale
|
33
|
+
if image.mode != "L":
|
34
|
+
image = image.convert("L")
|
35
|
+
|
36
|
+
# Optional blur to reduce sensitivity to minor variations
|
37
|
+
if blur_radius > 0:
|
38
|
+
from PIL import ImageFilter
|
39
|
+
|
40
|
+
image = image.filter(ImageFilter.GaussianBlur(radius=blur_radius))
|
41
|
+
|
42
|
+
# Resize to 32x32 (4x the hash size for DCT)
|
43
|
+
highfreq_factor = 4
|
44
|
+
img_size = hash_size * highfreq_factor
|
45
|
+
image = image.resize((img_size, img_size), Image.Resampling.LANCZOS)
|
46
|
+
|
47
|
+
# Convert to numpy array
|
48
|
+
pixels = np.array(image, dtype=np.float32)
|
49
|
+
|
50
|
+
# Apply DCT
|
51
|
+
from scipy.fftpack import dct
|
52
|
+
|
53
|
+
dct_coef = dct(dct(pixels, axis=0), axis=1)
|
54
|
+
|
55
|
+
# Keep top-left 8x8 (low frequencies)
|
56
|
+
dct_low = dct_coef[:hash_size, :hash_size]
|
57
|
+
|
58
|
+
# Compute median excluding the DC component
|
59
|
+
dct_low_no_dc = dct_low.flatten()[1:] # Skip first element (DC)
|
60
|
+
median = np.median(dct_low_no_dc)
|
61
|
+
|
62
|
+
# Create binary hash
|
63
|
+
diff = dct_low.flatten() > median
|
64
|
+
|
65
|
+
# Convert to integer
|
66
|
+
return sum(2**i for i, v in enumerate(diff) if v)
|
67
|
+
|
68
|
+
|
69
|
+
def hamming_distance(hash1: int, hash2: int, hash_size: int = 64) -> int:
|
70
|
+
"""Calculate Hamming distance between two hashes"""
|
71
|
+
# XOR and count set bits
|
72
|
+
xor = hash1 ^ hash2
|
73
|
+
return bin(xor).count("1")
|
74
|
+
|
75
|
+
|
76
|
+
def hash_similarity(hash1: int, hash2: int, hash_size: int = 64) -> float:
|
77
|
+
"""Calculate similarity score between two hashes (0-1)"""
|
78
|
+
distance = hamming_distance(hash1, hash2, hash_size)
|
79
|
+
return 1.0 - (distance / hash_size)
|
80
|
+
|
81
|
+
|
82
|
+
class VisualMatcher:
    """Finds visual matches of a template image inside a target image.

    Compares a sliding window over the target against the template using
    perceptual hashing (pHash), optionally at multiple scales, and returns
    non-overlapping match candidates ranked by confidence.
    """

    def __init__(self, hash_size: int = 12):
        """
        Args:
            hash_size: Side length of the square perceptual-hash grid; the
                resulting hash carries ``hash_size * hash_size`` bits.
        """
        self.hash_size = hash_size
        self.hash_bits = hash_size * hash_size  # total bits per hash
        self._cache = {}  # reserved for future hash caching

    def _get_search_scales(self, sizes: Optional[Union[float, Tuple, List]]) -> List[float]:
        """
        Convert various size input formats to a list of scales to search.

        Args:
            sizes: Can be:
                - None: just 1.0
                - float: ±percentage (e.g., 0.2 = 80%-120%)
                - tuple(min, max): range with smart logarithmic steps
                - tuple(min, max, step): explicit step size
                - list: exact sizes to use

        Returns:
            List of scale factors to search.

        Raises:
            ValueError: If ``sizes`` is not one of the supported formats, or
                an explicit step is not positive.
        """
        if sizes is None:
            return [1.0]

        # Explicit list of scales: use as-is (sorted for a deterministic order).
        if isinstance(sizes, list):
            return sorted(sizes)

        # Single number: interpret as ±percentage around 1.0 and fall through
        # to the (min, max) tuple handling below.
        if isinstance(sizes, (int, float)):
            if sizes <= 0:
                return [1.0]
            sizes = (max(0.1, 1.0 - sizes), 1.0 + sizes)

        if isinstance(sizes, tuple):
            if len(sizes) == 2:
                min_scale, max_scale = sizes
                if min_scale >= max_scale:
                    return [min_scale]

                # Smart defaults: more steps for wider ranges, spaced
                # logarithmically so relative size changes are even.
                ratio = max_scale / min_scale
                if ratio <= 1.5:  # Small range (e.g., 0.8-1.2)
                    num_steps = 5
                elif ratio <= 3.0:  # Medium range (e.g., 0.5-1.5)
                    num_steps = 7
                else:  # Large range (e.g., 0.5-2.0)
                    num_steps = 9

                log_scales = np.linspace(np.log(min_scale), np.log(max_scale), num_steps)
                scales = np.exp(log_scales).tolist()

                # Guarantee the native scale is searched when it lies in range:
                # snap the closest generated scale to exactly 1.0.
                if min_scale <= 1.0 <= max_scale and 1.0 not in scales:
                    closest_idx = np.argmin([abs(s - 1.0) for s in scales])
                    scales[closest_idx] = 1.0

                return scales

            elif len(sizes) == 3:
                # Explicit (min, max, step).
                min_scale, max_scale, step = sizes
                if step <= 0:
                    raise ValueError(f"Invalid sizes format: {sizes}")
                # Degenerate/inverted range: mirror the (min, max) behavior
                # instead of indexing into an empty list.
                if min_scale > max_scale:
                    return [min_scale]

                scales = []
                current = min_scale
                while current <= max_scale:
                    scales.append(current)
                    current += step
                # Include max_scale when float accumulation stopped just short.
                if scales and scales[-1] < max_scale and (max_scale - scales[-1]) < step * 0.1:
                    scales[-1] = max_scale
                return scales

        raise ValueError(f"Invalid sizes format: {sizes}")

    @staticmethod
    def _scan_geometry(
        template_size: Tuple[int, int],
        target_size: Tuple[int, int],
        scale: float,
        step_factor: float,
    ) -> Optional[Tuple[int, int, int, int]]:
        """Window geometry for one scale: (scaled_w, scaled_h, step_x, step_y).

        Returns None when the scaled template does not fit inside the target.
        """
        template_w, template_h = template_size
        target_w, target_h = target_size
        scaled_w = int(template_w * scale)
        scaled_h = int(template_h * scale)
        if scaled_w > target_w or scaled_h > target_h:
            return None
        # Step is a fraction of the window size, but at least one pixel.
        step_x = max(1, int(scaled_w * step_factor))
        step_y = max(1, int(scaled_h * step_factor))
        return scaled_w, scaled_h, step_x, step_y

    def find_matches_in_image(
        self,
        template: Image.Image,
        target: Image.Image,
        template_hash: Optional[int] = None,
        confidence_threshold: float = 0.6,
        step_factor: float = 0.1,
        sizes: Optional[Union[float, Tuple, List]] = None,
        show_progress: bool = True,
        progress_callback: Optional[Callable[[], None]] = None,
    ) -> List[MatchCandidate]:
        """
        Find all matches of template in target image using a sliding window.

        Args:
            template: Template image to search for
            target: Target image to search in
            template_hash: Pre-computed hash of template (optional)
            confidence_threshold: Minimum similarity score (0-1)
            step_factor: Step size as fraction of template size
            sizes: Size variations to search. Can be:
                - float: ±percentage (e.g., 0.2 = 80%-120%)
                - tuple(min, max): search range with smart logarithmic steps
                - tuple(min, max, step): explicit step size
                - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
            show_progress: Show progress bar for sliding window search
            progress_callback: Optional callback function to call for each window checked

        Returns:
            List of MatchCandidate objects (overlaps already filtered out).
        """
        matches = []

        if template_hash is None:
            template_hash = compute_phash(template, self.hash_size)

        template_size = template.size
        target_size = target.size
        template_w, template_h = template_size
        target_w, target_h = target_size

        scales = self._get_search_scales(sizes)

        # Set up a progress bar only when no external callback was supplied;
        # its total is the exact number of windows that will be scanned.
        progress_bar = None
        if show_progress and not progress_callback:
            total_iterations = 0
            for scale in scales:
                geometry = self._scan_geometry(template_size, target_size, scale, step_factor)
                if geometry is None:
                    continue
                scaled_w, scaled_h, step_x, step_y = geometry
                x_steps = len(range(0, target_w - scaled_w + 1, step_x))
                y_steps = len(range(0, target_h - scaled_h + 1, step_y))
                total_iterations += x_steps * y_steps
            if total_iterations > 0:
                progress_bar = tqdm(
                    total=total_iterations, desc="Scanning", unit="window", leave=False
                )

        # Sliding-window search at each scale.
        for scale in scales:
            geometry = self._scan_geometry(template_size, target_size, scale, step_factor)
            if geometry is None:
                continue
            scaled_w, scaled_h, step_x, step_y = geometry

            for y in range(0, target_h - scaled_h + 1, step_y):
                for x in range(0, target_w - scaled_w + 1, step_x):
                    window = target.crop((x, y, x + scaled_w, y + scaled_h))

                    # Resize back to template size so the hashes are comparable.
                    if scale != 1.0:
                        window = window.resize(
                            (template_w, template_h), Image.Resampling.LANCZOS
                        )

                    window_hash = compute_phash(window, self.hash_size)
                    similarity = hash_similarity(template_hash, window_hash, self.hash_bits)

                    if similarity >= confidence_threshold:
                        # bbox is in target-image pixel coordinates.
                        bbox = (x, y, x + scaled_w, y + scaled_h)
                        matches.append(MatchCandidate(bbox, window_hash, similarity))

                    if progress_bar:
                        progress_bar.update(1)
                    elif progress_callback:
                        progress_callback()

        if progress_bar:
            progress_bar.close()

        # Remove overlapping matches (keep highest confidence).
        return self._filter_overlapping_matches(matches)

    def _filter_overlapping_matches(
        self, matches: List[MatchCandidate], overlap_threshold: float = 0.5
    ) -> List[MatchCandidate]:
        """Remove overlapping matches, keeping the highest-confidence ones.

        Greedy non-maximum suppression: matches are visited from most to
        least confident, and a candidate is dropped when its IoU with any
        already-kept match exceeds ``overlap_threshold``.
        """
        if not matches:
            return matches

        sorted_matches = sorted(matches, key=lambda m: m.confidence, reverse=True)
        filtered = []

        for candidate in sorted_matches:
            keep = True
            for selected in filtered:
                if self._calculate_overlap(candidate.bbox, selected.bbox) > overlap_threshold:
                    keep = False
                    break
            if keep:
                filtered.append(candidate)

        return filtered

    def _calculate_overlap(self, bbox1: Tuple, bbox2: Tuple) -> float:
        """Intersection-over-union (IoU) of two (xmin, ymin, xmax, ymax) boxes."""
        x1_min, y1_min, x1_max, y1_max = bbox1
        x2_min, y2_min, x2_max, y2_max = bbox2

        intersect_xmin = max(x1_min, x2_min)
        intersect_ymin = max(y1_min, y2_min)
        intersect_xmax = min(x1_max, x2_max)
        intersect_ymax = min(y1_max, y2_max)

        # No overlap at all.
        if intersect_xmax < intersect_xmin or intersect_ymax < intersect_ymin:
            return 0.0

        intersect_area = (intersect_xmax - intersect_xmin) * (intersect_ymax - intersect_ymin)

        area1 = (x1_max - x1_min) * (y1_max - y1_min)
        area2 = (x2_max - x2_min) * (y2_max - y2_min)
        union_area = area1 + area2 - intersect_area

        return intersect_area / union_area if union_area > 0 else 0.0
|
@@ -27,24 +27,24 @@ natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666
|
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
28
|
natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
|
29
29
|
natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
|
30
|
-
natural_pdf/core/page.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=XrDePXZgXgB3w8hvxh4-EhPQnrwmw-0z-I_K24__OtY,142550
|
31
31
|
natural_pdf/core/page_collection.py,sha256=hEeXs_fzB73XZ8ZkHz2kIuSgBYcVYydvGMMdGuB1rvw,52486
|
32
32
|
natural_pdf/core/page_groupby.py,sha256=550ME6kd-h-2u75oUIIIqTYsmh8VvdQO1nXXioL8J6A,7378
|
33
|
-
natural_pdf/core/pdf.py,sha256=
|
34
|
-
natural_pdf/core/pdf_collection.py,sha256=
|
35
|
-
natural_pdf/core/render_spec.py,sha256=
|
33
|
+
natural_pdf/core/pdf.py,sha256=Loe6sbQzBp9VDeIAuDS3zQmeDWvQMj5SWIQMky5bPDA,101964
|
34
|
+
natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
|
35
|
+
natural_pdf/core/render_spec.py,sha256=rLicaS9EPyojpJcjy2Lzn5DLWQwjrFyDJyRo7jbjdGU,14505
|
36
36
|
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
37
37
|
natural_pdf/describe/base.py,sha256=Of9WVo9XuShXoeyJr0RN2CpLhF_CeiOjazl-or53RKU,18173
|
38
38
|
natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
|
39
39
|
natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
|
40
40
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
41
41
|
natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
|
42
|
-
natural_pdf/elements/base.py,sha256=
|
42
|
+
natural_pdf/elements/base.py,sha256=aj-eXOQQlhKv9lYeUlUs9aKNcUebtG_dqxURZHZVZ58,55509
|
43
43
|
natural_pdf/elements/element_collection.py,sha256=slCUnOT04sNOTjSGgmhjcCKKPVPtdDPwU7PX1ebzGMw,101342
|
44
44
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
45
45
|
natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
|
46
46
|
natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
|
47
|
-
natural_pdf/elements/region.py,sha256=
|
47
|
+
natural_pdf/elements/region.py,sha256=RxWidI7oNrdbuuj94SfdFXmcSDTfy89uGCeVMQvAfks,155591
|
48
48
|
natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
|
49
49
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
50
50
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
@@ -100,9 +100,13 @@ natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-
|
|
100
100
|
natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
|
101
101
|
natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
|
102
102
|
natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
|
103
|
+
natural_pdf/vision/__init__.py,sha256=RymMY-3WLQBlOZ4Dx4MmL9UH6I65hNjkwUJ7ymO5JfM,287
|
104
|
+
natural_pdf/vision/mixin.py,sha256=OJwBABr74TWxP5seTKUmGj5zE9mWsBP_UKWU-Pr8V9A,8720
|
105
|
+
natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs,5119
|
106
|
+
natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
|
103
107
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
104
108
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
105
|
-
natural_pdf-0.2.
|
109
|
+
natural_pdf-0.2.4.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
106
110
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
107
111
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
108
112
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
@@ -119,8 +123,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
119
123
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
120
124
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
121
125
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
122
|
-
natural_pdf-0.2.
|
123
|
-
natural_pdf-0.2.
|
124
|
-
natural_pdf-0.2.
|
125
|
-
natural_pdf-0.2.
|
126
|
-
natural_pdf-0.2.
|
126
|
+
natural_pdf-0.2.4.dist-info/METADATA,sha256=G1tmes61GVEt6zLeDISuJZgceLQywIU-uRspGA_90Q8,6959
|
127
|
+
natural_pdf-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
128
|
+
natural_pdf-0.2.4.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
129
|
+
natural_pdf-0.2.4.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
|
130
|
+
natural_pdf-0.2.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|