natural-pdf 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. natural_pdf/analyzers/guides.py +196 -43
  2. natural_pdf/core/highlighting_service.py +40 -10
  3. natural_pdf/core/page.py +56 -8
  4. natural_pdf/elements/base.py +15 -1
  5. natural_pdf/elements/region.py +37 -5
  6. natural_pdf/vision/__init__.py +1 -2
  7. natural_pdf/vision/mixin.py +67 -27
  8. natural_pdf/vision/results.py +49 -5
  9. natural_pdf/vision/similarity.py +195 -23
  10. natural_pdf/vision/template_matching.py +209 -0
  11. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/METADATA +1 -1
  12. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/RECORD +36 -15
  13. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/top_level.txt +1 -0
  14. temp/fix_page_exclusions.py +42 -0
  15. temp/test_draw_guides.py +25 -0
  16. temp/test_draw_guides_interactive.py +30 -0
  17. temp/test_exclusion_with_debug.py +30 -0
  18. temp/test_find_exclusions_fix.py +53 -0
  19. temp/test_find_exclusions_fix_no_recursion.py +97 -0
  20. temp/test_fix_real_pdf.py +48 -0
  21. temp/test_fix_working.py +55 -0
  22. temp/test_fixed_pdf_exclusions.py +67 -0
  23. temp/test_guide_draw_notebook.py +47 -0
  24. temp/test_horizontal_top_bottom.py +53 -0
  25. temp/test_inline_js.py +22 -0
  26. temp/test_marker_order.py +45 -0
  27. temp/test_original_exclusions_now_work.py +56 -0
  28. temp/test_pdf_exclusions_with_guides.py +84 -0
  29. temp/test_region_exclusions_detailed.py +25 -0
  30. temp/test_stripes_real_pdf.py +62 -0
  31. temp/test_vertical_stripes.py +55 -0
  32. temp/test_widget_functionality.py +68 -0
  33. temp/test_widget_simple.py +41 -0
  34. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/WHEEL +0 -0
  35. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/entry_points.txt +0 -0
  36. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/licenses/LICENSE +0 -0
@@ -45,6 +45,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
45
45
 
46
46
  # Import new utils
47
47
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
48
+ from natural_pdf.vision.mixin import VisualSearchMixin
48
49
 
49
50
  # Import viewer widget support
50
51
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
@@ -80,6 +81,7 @@ class Region(
80
81
  ExtractionMixin,
81
82
  ShapeDetectionMixin,
82
83
  DescribeMixin,
84
+ VisualSearchMixin,
83
85
  Visualizable,
84
86
  ):
85
87
  """Represents a rectangular region on a page.
@@ -1270,7 +1272,8 @@ class Region(
1270
1272
  # 3. Get Relevant Exclusions (overlapping this region)
1271
1273
  apply_exclusions_flag = kwargs.get("apply_exclusions", apply_exclusions)
1272
1274
  exclusion_regions = []
1273
- if apply_exclusions_flag and self._page._exclusions:
1275
+ if apply_exclusions_flag:
1276
+ # Always call _get_exclusion_regions to get both page and PDF level exclusions
1274
1277
  all_page_exclusions = self._page._get_exclusion_regions(
1275
1278
  include_callable=True, debug=debug
1276
1279
  )
@@ -1281,10 +1284,11 @@ class Region(
1281
1284
  exclusion_regions = overlapping_exclusions
1282
1285
  if debug:
1283
1286
  logger.debug(
1284
- f"Region {self.bbox}: Applying {len(exclusion_regions)} overlapping exclusions."
1287
+ f"Region {self.bbox}: Found {len(all_page_exclusions)} total exclusions, "
1288
+ f"{len(exclusion_regions)} overlapping this region."
1285
1289
  )
1286
1290
  elif debug:
1287
- logger.debug(f"Region {self.bbox}: Not applying exclusions.")
1291
+ logger.debug(f"Region {self.bbox}: Not applying exclusions (apply_exclusions=False).")
1288
1292
 
1289
1293
  # 4. Spatially Filter Characters using Utility
1290
1294
  # Pass self as the target_region for precise polygon checks etc.
@@ -1690,7 +1694,21 @@ class Region(
1690
1694
  else:
1691
1695
  filtered_page = base_plumber_page
1692
1696
 
1693
- cropped = filtered_page.crop(self.bbox)
1697
+ # Ensure bbox is within pdfplumber page bounds
1698
+ page_bbox = filtered_page.bbox
1699
+ clipped_bbox = (
1700
+ max(self.bbox[0], page_bbox[0]), # x0
1701
+ max(self.bbox[1], page_bbox[1]), # y0
1702
+ min(self.bbox[2], page_bbox[2]), # x1
1703
+ min(self.bbox[3], page_bbox[3]), # y1
1704
+ )
1705
+
1706
+ # Only crop if the clipped bbox is valid (has positive width and height)
1707
+ if clipped_bbox[2] > clipped_bbox[0] and clipped_bbox[3] > clipped_bbox[1]:
1708
+ cropped = filtered_page.crop(clipped_bbox)
1709
+ else:
1710
+ # If the region is completely outside the page bounds, return empty list
1711
+ return []
1694
1712
 
1695
1713
  # Extract all tables from the cropped area
1696
1714
  tables = cropped.extract_tables(table_settings)
@@ -1784,7 +1802,21 @@ class Region(
1784
1802
  filtered_page = base_plumber_page
1785
1803
 
1786
1804
  # Now crop the (possibly filtered) page to the region bbox
1787
- cropped = filtered_page.crop(self.bbox)
1805
+ # Ensure bbox is within pdfplumber page bounds
1806
+ page_bbox = filtered_page.bbox
1807
+ clipped_bbox = (
1808
+ max(self.bbox[0], page_bbox[0]), # x0
1809
+ max(self.bbox[1], page_bbox[1]), # y0
1810
+ min(self.bbox[2], page_bbox[2]), # x1
1811
+ min(self.bbox[3], page_bbox[3]), # y1
1812
+ )
1813
+
1814
+ # Only crop if the clipped bbox is valid (has positive width and height)
1815
+ if clipped_bbox[2] > clipped_bbox[0] and clipped_bbox[3] > clipped_bbox[1]:
1816
+ cropped = filtered_page.crop(clipped_bbox)
1817
+ else:
1818
+ # If the region is completely outside the page bounds, return empty table
1819
+ return []
1788
1820
 
1789
1821
  # Extract the single largest table from the cropped area
1790
1822
  table = cropped.extract_table(table_settings)
@@ -1,7 +1,6 @@
1
1
  """Vision module for visual similarity and pattern matching"""
2
2
 
3
3
  from .mixin import VisualSearchMixin
4
- from .results import Match, MatchResults
5
4
  from .similarity import VisualMatcher, compute_phash
6
5
 
7
- __all__ = ["VisualMatcher", "compute_phash", "Match", "MatchResults", "VisualSearchMixin"]
6
+ __all__ = ["VisualMatcher", "compute_phash", "VisualSearchMixin"]
@@ -6,9 +6,6 @@ import numpy as np
6
6
  from PIL import Image
7
7
  from tqdm.auto import tqdm
8
8
 
9
- from .results import Match, MatchResults
10
- from .similarity import VisualMatcher, compute_phash
11
-
12
9
 
13
10
  class VisualSearchMixin:
14
11
  """Add find_similar method to classes that include this mixin"""
@@ -21,11 +18,12 @@ class VisualSearchMixin:
21
18
  sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
22
19
  resolution: int = 72,
23
20
  hash_size: int = 20,
24
- step_factor: float = 0.1,
21
+ step: Optional[int] = None,
22
+ method: str = "phash",
25
23
  max_per_page: Optional[int] = None,
26
24
  show_progress: bool = True,
27
25
  **kwargs,
28
- ) -> MatchResults:
26
+ ) -> "MatchResults":
29
27
  """
30
28
  Find regions visually similar to the given example(s).
31
29
 
@@ -35,15 +33,19 @@ class VisualSearchMixin:
35
33
  confidence: Minimum similarity score (0-1)
36
34
  sizes: Size variations to search. Can be:
37
35
  - float: ±percentage (e.g., 0.2 = 80%-120%)
38
- - tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.0))
36
+ - tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.2))
39
37
  - tuple(min, max, step): explicit step size
40
38
  - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
41
39
  resolution: Resolution for image comparison (DPI) (default: 72)
42
- hash_size: Size of perceptual hash grid (default: 12)
43
- step_factor: Step size as fraction of template size (default: 0.1)
40
+ hash_size: Size of perceptual hash grid (default: 20)
41
+ step: Step size in pixels for sliding window
42
+ method: Matching algorithm - "phash" (default) or "template"
44
43
  max_per_page: Maximum matches to return per page
45
44
  show_progress: Show progress bar for multi-page searches (default: True)
46
- **kwargs: Additional options
45
+ **kwargs: Additional options including:
46
+ mask_threshold: For both template and phash methods, pixels >= this value are masked.
47
+ For template matching: pixels are ignored in matching (e.g., 0.95)
48
+ For phash: pixels are replaced with median before hashing (e.g., 0.95)
47
49
 
48
50
  Returns:
49
51
  MatchResults collection
@@ -55,15 +57,25 @@ class VisualSearchMixin:
55
57
  if not isinstance(examples, list):
56
58
  examples = [examples]
57
59
 
60
+ from .similarity import VisualMatcher, compute_phash
61
+
58
62
  # Initialize matcher with specified hash size
59
63
  matcher = VisualMatcher(hash_size=hash_size)
60
64
 
61
65
  # Prepare templates
62
66
  templates = []
67
+ # Extract mask_threshold from kwargs for phash
68
+ mask_threshold = kwargs.get("mask_threshold")
69
+ mask_threshold_255 = (
70
+ int(mask_threshold * 255) if mask_threshold is not None and method == "phash" else None
71
+ )
72
+
63
73
  for example in examples:
64
74
  # Render the example region/element
65
75
  example_image = example.render(resolution=resolution, crop=True)
66
- template_hash = compute_phash(example_image, hash_size=hash_size)
76
+ template_hash = compute_phash(
77
+ example_image, hash_size=hash_size, mask_threshold=mask_threshold_255
78
+ )
67
79
  templates.append({"image": example_image, "hash": template_hash, "source": example})
68
80
 
69
81
  # Get pages to search based on the object type
@@ -76,6 +88,8 @@ class VisualSearchMixin:
76
88
  pages_to_search = self.pages
77
89
  elif hasattr(self, "number"): # Single page
78
90
  pages_to_search = [self]
91
+ elif hasattr(self, "page") and hasattr(self, "bbox"): # Region
92
+ pages_to_search = [self]
79
93
  else:
80
94
  raise TypeError(f"Cannot search in {type(self)}")
81
95
 
@@ -86,10 +100,16 @@ class VisualSearchMixin:
86
100
  scales = matcher._get_search_scales(sizes)
87
101
 
88
102
  # Pre-calculate for all pages and templates
89
- for page in pages_to_search:
90
- # Estimate page image size
91
- page_w = int(page.width * resolution / 72.0)
92
- page_h = int(page.height * resolution / 72.0)
103
+ for search_obj in pages_to_search:
104
+ # Estimate image size based on object type
105
+ if hasattr(search_obj, "page") and hasattr(search_obj, "bbox"):
106
+ # Region
107
+ page_w = int(search_obj.width * resolution / 72.0)
108
+ page_h = int(search_obj.height * resolution / 72.0)
109
+ else:
110
+ # Page
111
+ page_w = int(search_obj.width * resolution / 72.0)
112
+ page_h = int(search_obj.height * resolution / 72.0)
93
113
 
94
114
  for template_data in templates:
95
115
  template_w, template_h = template_data["image"].size
@@ -99,11 +119,15 @@ class VisualSearchMixin:
99
119
  scaled_h = int(template_h * scale)
100
120
 
101
121
  if scaled_w <= page_w and scaled_h <= page_h:
102
- step_x = max(1, int(scaled_w * step_factor))
103
- step_y = max(1, int(scaled_h * step_factor))
104
-
105
- x_windows = len(range(0, page_w - scaled_w + 1, step_x))
106
- y_windows = len(range(0, page_h - scaled_h + 1, step_y))
122
+ # Determine step size
123
+ if step is not None:
124
+ actual_step = step
125
+ else:
126
+ # Default to 10% of template size
127
+ actual_step = max(1, int(min(scaled_w, scaled_h) * 0.1))
128
+
129
+ x_windows = len(range(0, page_w - scaled_w + 1, actual_step))
130
+ y_windows = len(range(0, page_h - scaled_h + 1, actual_step))
107
131
  total_operations += x_windows * y_windows
108
132
 
109
133
  # Search each page
@@ -124,9 +148,20 @@ class VisualSearchMixin:
124
148
  mininterval=0.1, # Minimum time between updates (seconds)
125
149
  )
126
150
 
127
- for page_idx, page in enumerate(pages_to_search):
128
- # Render the full page once
129
- page_image = page.render(resolution=resolution)
151
+ for page_idx, search_obj in enumerate(pages_to_search):
152
+ # Determine if we're searching in a page or a region
153
+ if hasattr(search_obj, "page") and hasattr(search_obj, "bbox"):
154
+ # This is a Region - render only the region area
155
+ region = search_obj
156
+ page = region.page
157
+ page_image = region.render(resolution=resolution, crop=True)
158
+ # Region offset for coordinate conversion
159
+ region_x0, region_y0 = region.x0, region.top
160
+ else:
161
+ # This is a Page - render the full page
162
+ page = search_obj
163
+ page_image = page.render(resolution=resolution)
164
+ region_x0, region_y0 = 0, 0
130
165
 
131
166
  # Convert page coordinates to image coordinates
132
167
  scale = resolution / 72.0 # PDF is 72 DPI
@@ -168,7 +203,8 @@ class VisualSearchMixin:
168
203
  template_hash=template_hash,
169
204
  confidence_threshold=confidence,
170
205
  sizes=sizes,
171
- step_factor=step_factor,
206
+ step=step,
207
+ method=method,
172
208
  show_progress=False, # We handle progress ourselves
173
209
  progress_callback=update_progress if progress_bar else None,
174
210
  **kwargs,
@@ -180,10 +216,12 @@ class VisualSearchMixin:
180
216
 
181
217
  # Convert from image pixels to PDF points
182
218
  # No flipping needed! PDF coordinates map directly to PIL coordinates
183
- pdf_x0 = img_x0 / scale
184
- pdf_y0 = img_y0 / scale
185
- pdf_x1 = img_x1 / scale
186
- pdf_y1 = img_y1 / scale
219
+ pdf_x0 = img_x0 / scale + region_x0
220
+ pdf_y0 = img_y0 / scale + region_y0
221
+ pdf_x1 = img_x1 / scale + region_x0
222
+ pdf_y1 = img_y1 / scale + region_y0
223
+
224
+ from .results import Match
187
225
 
188
226
  # Create Match object
189
227
  match = Match(
@@ -206,4 +244,6 @@ class VisualSearchMixin:
206
244
  if progress_bar:
207
245
  progress_bar.close()
208
246
 
247
+ from .results import MatchResults
248
+
209
249
  return MatchResults(all_matches)
@@ -2,7 +2,6 @@
2
2
 
3
3
  from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple
4
4
 
5
- # Import Region directly as it's a base class
6
5
  from natural_pdf.elements.region import Region
7
6
 
8
7
  if TYPE_CHECKING:
@@ -39,16 +38,41 @@ class Match(Region):
39
38
 
40
39
 
41
40
  class MatchResults:
42
- """Collection of Match objects with transformation methods"""
41
+ """
42
+ Collection of Match objects with transformation methods.
43
+
44
+ Matches are automatically sorted by confidence (highest first), so:
45
+ - matches[0] is the best match
46
+ - Iteration yields matches from best to worst
47
+ - The .top(n) method returns the n best matches
48
+
49
+ Example:
50
+ >>> matches = page.find_similar(logo_region)
51
+ >>> print(f"Found {len(matches)} matches")
52
+ >>>
53
+ >>> # Best match
54
+ >>> best = matches[0]
55
+ >>> print(f"Best match confidence: {best.confidence:.3f}")
56
+ >>>
57
+ >>> # Top 5 matches
58
+ >>> for match in matches.top(5):
59
+ ... print(f"Confidence: {match.confidence:.3f} at page {match.page.number}")
60
+ >>>
61
+ >>> # All matches above 90% confidence
62
+ >>> high_conf = matches.filter_by_confidence(0.9)
63
+ """
43
64
 
44
65
  def __init__(self, matches: List[Match]):
45
- """Initialize with list of Match objects"""
66
+ """Initialize with list of Match objects, automatically sorted by confidence"""
46
67
  # Import here to avoid circular import
47
68
  from natural_pdf.elements.element_collection import ElementCollection
48
69
 
70
+ # Sort matches by confidence (highest first)
71
+ sorted_matches = sorted(matches, key=lambda m: m.confidence, reverse=True)
72
+
49
73
  # Create a base ElementCollection
50
- self._collection = ElementCollection(matches)
51
- self._matches = matches
74
+ self._collection = ElementCollection(sorted_matches)
75
+ self._matches = sorted_matches
52
76
 
53
77
  def __len__(self):
54
78
  return len(self._matches)
@@ -68,6 +92,26 @@ class MatchResults:
68
92
  """Filter matches by minimum confidence"""
69
93
  return self.filter(lambda m: m.confidence >= min_confidence)
70
94
 
95
+ def top(self, n: int) -> "MatchResults":
96
+ """
97
+ Get the top N matches with highest confidence.
98
+
99
+ Args:
100
+ n: Number of top matches to return
101
+
102
+ Returns:
103
+ New MatchResults with only the top N matches
104
+
105
+ Example:
106
+ >>> matches = page.find_similar(logo)
107
+ >>> best_5 = matches.top(5)
108
+ >>> for match in best_5:
109
+ ... print(f"Confidence: {match.confidence:.3f}")
110
+ """
111
+ # Since matches are already sorted by confidence, just take first n
112
+ top_matches = self._matches[:n]
113
+ return MatchResults(top_matches)
114
+
71
115
  def pages(self):
72
116
  """Get unique pages containing matches"""
73
117
  # Import here to avoid circular import