natural-pdf: natural_pdf-0.2.11-py3-none-any.whl → natural_pdf-0.2.13-py3-none-any.whl
This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +196 -43
- natural_pdf/core/highlighting_service.py +40 -10
- natural_pdf/core/page.py +56 -8
- natural_pdf/elements/base.py +15 -1
- natural_pdf/elements/region.py +37 -5
- natural_pdf/vision/__init__.py +1 -2
- natural_pdf/vision/mixin.py +67 -27
- natural_pdf/vision/results.py +49 -5
- natural_pdf/vision/similarity.py +195 -23
- natural_pdf/vision/template_matching.py +209 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/RECORD +36 -15
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/top_level.txt +1 -0
- temp/fix_page_exclusions.py +42 -0
- temp/test_draw_guides.py +25 -0
- temp/test_draw_guides_interactive.py +30 -0
- temp/test_exclusion_with_debug.py +30 -0
- temp/test_find_exclusions_fix.py +53 -0
- temp/test_find_exclusions_fix_no_recursion.py +97 -0
- temp/test_fix_real_pdf.py +48 -0
- temp/test_fix_working.py +55 -0
- temp/test_fixed_pdf_exclusions.py +67 -0
- temp/test_guide_draw_notebook.py +47 -0
- temp/test_horizontal_top_bottom.py +53 -0
- temp/test_inline_js.py +22 -0
- temp/test_marker_order.py +45 -0
- temp/test_original_exclusions_now_work.py +56 -0
- temp/test_pdf_exclusions_with_guides.py +84 -0
- temp/test_region_exclusions_detailed.py +25 -0
- temp/test_stripes_real_pdf.py +62 -0
- temp/test_vertical_stripes.py +55 -0
- temp/test_widget_functionality.py +68 -0
- temp/test_widget_simple.py +41 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -45,6 +45,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
45
45
|
|
46
46
|
# Import new utils
|
47
47
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
48
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
48
49
|
|
49
50
|
# Import viewer widget support
|
50
51
|
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
|
@@ -80,6 +81,7 @@ class Region(
|
|
80
81
|
ExtractionMixin,
|
81
82
|
ShapeDetectionMixin,
|
82
83
|
DescribeMixin,
|
84
|
+
VisualSearchMixin,
|
83
85
|
Visualizable,
|
84
86
|
):
|
85
87
|
"""Represents a rectangular region on a page.
|
@@ -1270,7 +1272,8 @@ class Region(
|
|
1270
1272
|
# 3. Get Relevant Exclusions (overlapping this region)
|
1271
1273
|
apply_exclusions_flag = kwargs.get("apply_exclusions", apply_exclusions)
|
1272
1274
|
exclusion_regions = []
|
1273
|
-
if apply_exclusions_flag
|
1275
|
+
if apply_exclusions_flag:
|
1276
|
+
# Always call _get_exclusion_regions to get both page and PDF level exclusions
|
1274
1277
|
all_page_exclusions = self._page._get_exclusion_regions(
|
1275
1278
|
include_callable=True, debug=debug
|
1276
1279
|
)
|
@@ -1281,10 +1284,11 @@ class Region(
|
|
1281
1284
|
exclusion_regions = overlapping_exclusions
|
1282
1285
|
if debug:
|
1283
1286
|
logger.debug(
|
1284
|
-
f"Region {self.bbox}:
|
1287
|
+
f"Region {self.bbox}: Found {len(all_page_exclusions)} total exclusions, "
|
1288
|
+
f"{len(exclusion_regions)} overlapping this region."
|
1285
1289
|
)
|
1286
1290
|
elif debug:
|
1287
|
-
logger.debug(f"Region {self.bbox}: Not applying exclusions.")
|
1291
|
+
logger.debug(f"Region {self.bbox}: Not applying exclusions (apply_exclusions=False).")
|
1288
1292
|
|
1289
1293
|
# 4. Spatially Filter Characters using Utility
|
1290
1294
|
# Pass self as the target_region for precise polygon checks etc.
|
@@ -1690,7 +1694,21 @@ class Region(
|
|
1690
1694
|
else:
|
1691
1695
|
filtered_page = base_plumber_page
|
1692
1696
|
|
1693
|
-
|
1697
|
+
# Ensure bbox is within pdfplumber page bounds
|
1698
|
+
page_bbox = filtered_page.bbox
|
1699
|
+
clipped_bbox = (
|
1700
|
+
max(self.bbox[0], page_bbox[0]), # x0
|
1701
|
+
max(self.bbox[1], page_bbox[1]), # y0
|
1702
|
+
min(self.bbox[2], page_bbox[2]), # x1
|
1703
|
+
min(self.bbox[3], page_bbox[3]), # y1
|
1704
|
+
)
|
1705
|
+
|
1706
|
+
# Only crop if the clipped bbox is valid (has positive width and height)
|
1707
|
+
if clipped_bbox[2] > clipped_bbox[0] and clipped_bbox[3] > clipped_bbox[1]:
|
1708
|
+
cropped = filtered_page.crop(clipped_bbox)
|
1709
|
+
else:
|
1710
|
+
# If the region is completely outside the page bounds, return empty list
|
1711
|
+
return []
|
1694
1712
|
|
1695
1713
|
# Extract all tables from the cropped area
|
1696
1714
|
tables = cropped.extract_tables(table_settings)
|
@@ -1784,7 +1802,21 @@ class Region(
|
|
1784
1802
|
filtered_page = base_plumber_page
|
1785
1803
|
|
1786
1804
|
# Now crop the (possibly filtered) page to the region bbox
|
1787
|
-
|
1805
|
+
# Ensure bbox is within pdfplumber page bounds
|
1806
|
+
page_bbox = filtered_page.bbox
|
1807
|
+
clipped_bbox = (
|
1808
|
+
max(self.bbox[0], page_bbox[0]), # x0
|
1809
|
+
max(self.bbox[1], page_bbox[1]), # y0
|
1810
|
+
min(self.bbox[2], page_bbox[2]), # x1
|
1811
|
+
min(self.bbox[3], page_bbox[3]), # y1
|
1812
|
+
)
|
1813
|
+
|
1814
|
+
# Only crop if the clipped bbox is valid (has positive width and height)
|
1815
|
+
if clipped_bbox[2] > clipped_bbox[0] and clipped_bbox[3] > clipped_bbox[1]:
|
1816
|
+
cropped = filtered_page.crop(clipped_bbox)
|
1817
|
+
else:
|
1818
|
+
# If the region is completely outside the page bounds, return empty table
|
1819
|
+
return []
|
1788
1820
|
|
1789
1821
|
# Extract the single largest table from the cropped area
|
1790
1822
|
table = cropped.extract_table(table_settings)
|
natural_pdf/vision/__init__.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
"""Vision module for visual similarity and pattern matching"""
|
2
2
|
|
3
3
|
from .mixin import VisualSearchMixin
|
4
|
-
from .results import Match, MatchResults
|
5
4
|
from .similarity import VisualMatcher, compute_phash
|
6
5
|
|
7
|
-
__all__ = ["VisualMatcher", "compute_phash", "
|
6
|
+
__all__ = ["VisualMatcher", "compute_phash", "VisualSearchMixin"]
|
natural_pdf/vision/mixin.py
CHANGED
@@ -6,9 +6,6 @@ import numpy as np
|
|
6
6
|
from PIL import Image
|
7
7
|
from tqdm.auto import tqdm
|
8
8
|
|
9
|
-
from .results import Match, MatchResults
|
10
|
-
from .similarity import VisualMatcher, compute_phash
|
11
|
-
|
12
9
|
|
13
10
|
class VisualSearchMixin:
|
14
11
|
"""Add find_similar method to classes that include this mixin"""
|
@@ -21,11 +18,12 @@ class VisualSearchMixin:
|
|
21
18
|
sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
|
22
19
|
resolution: int = 72,
|
23
20
|
hash_size: int = 20,
|
24
|
-
|
21
|
+
step: Optional[int] = None,
|
22
|
+
method: str = "phash",
|
25
23
|
max_per_page: Optional[int] = None,
|
26
24
|
show_progress: bool = True,
|
27
25
|
**kwargs,
|
28
|
-
) -> MatchResults:
|
26
|
+
) -> "MatchResults":
|
29
27
|
"""
|
30
28
|
Find regions visually similar to the given example(s).
|
31
29
|
|
@@ -35,15 +33,19 @@ class VisualSearchMixin:
|
|
35
33
|
confidence: Minimum similarity score (0-1)
|
36
34
|
sizes: Size variations to search. Can be:
|
37
35
|
- float: ±percentage (e.g., 0.2 = 80%-120%)
|
38
|
-
- tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.
|
36
|
+
- tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.2))
|
39
37
|
- tuple(min, max, step): explicit step size
|
40
38
|
- list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
|
41
39
|
resolution: Resolution for image comparison (DPI) (default: 72)
|
42
|
-
hash_size: Size of perceptual hash grid (default:
|
43
|
-
|
40
|
+
hash_size: Size of perceptual hash grid (default: 20)
|
41
|
+
step: Step size in pixels for sliding window
|
42
|
+
method: Matching algorithm - "phash" (default) or "template"
|
44
43
|
max_per_page: Maximum matches to return per page
|
45
44
|
show_progress: Show progress bar for multi-page searches (default: True)
|
46
|
-
**kwargs: Additional options
|
45
|
+
**kwargs: Additional options including:
|
46
|
+
mask_threshold: For both template and phash methods, pixels >= this value are masked.
|
47
|
+
For template matching: pixels are ignored in matching (e.g., 0.95)
|
48
|
+
For phash: pixels are replaced with median before hashing (e.g., 0.95)
|
47
49
|
|
48
50
|
Returns:
|
49
51
|
MatchResults collection
|
@@ -55,15 +57,25 @@ class VisualSearchMixin:
|
|
55
57
|
if not isinstance(examples, list):
|
56
58
|
examples = [examples]
|
57
59
|
|
60
|
+
from .similarity import VisualMatcher, compute_phash
|
61
|
+
|
58
62
|
# Initialize matcher with specified hash size
|
59
63
|
matcher = VisualMatcher(hash_size=hash_size)
|
60
64
|
|
61
65
|
# Prepare templates
|
62
66
|
templates = []
|
67
|
+
# Extract mask_threshold from kwargs for phash
|
68
|
+
mask_threshold = kwargs.get("mask_threshold")
|
69
|
+
mask_threshold_255 = (
|
70
|
+
int(mask_threshold * 255) if mask_threshold is not None and method == "phash" else None
|
71
|
+
)
|
72
|
+
|
63
73
|
for example in examples:
|
64
74
|
# Render the example region/element
|
65
75
|
example_image = example.render(resolution=resolution, crop=True)
|
66
|
-
template_hash = compute_phash(
|
76
|
+
template_hash = compute_phash(
|
77
|
+
example_image, hash_size=hash_size, mask_threshold=mask_threshold_255
|
78
|
+
)
|
67
79
|
templates.append({"image": example_image, "hash": template_hash, "source": example})
|
68
80
|
|
69
81
|
# Get pages to search based on the object type
|
@@ -76,6 +88,8 @@ class VisualSearchMixin:
|
|
76
88
|
pages_to_search = self.pages
|
77
89
|
elif hasattr(self, "number"): # Single page
|
78
90
|
pages_to_search = [self]
|
91
|
+
elif hasattr(self, "page") and hasattr(self, "bbox"): # Region
|
92
|
+
pages_to_search = [self]
|
79
93
|
else:
|
80
94
|
raise TypeError(f"Cannot search in {type(self)}")
|
81
95
|
|
@@ -86,10 +100,16 @@ class VisualSearchMixin:
|
|
86
100
|
scales = matcher._get_search_scales(sizes)
|
87
101
|
|
88
102
|
# Pre-calculate for all pages and templates
|
89
|
-
for
|
90
|
-
# Estimate
|
91
|
-
|
92
|
-
|
103
|
+
for search_obj in pages_to_search:
|
104
|
+
# Estimate image size based on object type
|
105
|
+
if hasattr(search_obj, "page") and hasattr(search_obj, "bbox"):
|
106
|
+
# Region
|
107
|
+
page_w = int(search_obj.width * resolution / 72.0)
|
108
|
+
page_h = int(search_obj.height * resolution / 72.0)
|
109
|
+
else:
|
110
|
+
# Page
|
111
|
+
page_w = int(search_obj.width * resolution / 72.0)
|
112
|
+
page_h = int(search_obj.height * resolution / 72.0)
|
93
113
|
|
94
114
|
for template_data in templates:
|
95
115
|
template_w, template_h = template_data["image"].size
|
@@ -99,11 +119,15 @@ class VisualSearchMixin:
|
|
99
119
|
scaled_h = int(template_h * scale)
|
100
120
|
|
101
121
|
if scaled_w <= page_w and scaled_h <= page_h:
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
122
|
+
# Determine step size
|
123
|
+
if step is not None:
|
124
|
+
actual_step = step
|
125
|
+
else:
|
126
|
+
# Default to 10% of template size
|
127
|
+
actual_step = max(1, int(min(scaled_w, scaled_h) * 0.1))
|
128
|
+
|
129
|
+
x_windows = len(range(0, page_w - scaled_w + 1, actual_step))
|
130
|
+
y_windows = len(range(0, page_h - scaled_h + 1, actual_step))
|
107
131
|
total_operations += x_windows * y_windows
|
108
132
|
|
109
133
|
# Search each page
|
@@ -124,9 +148,20 @@ class VisualSearchMixin:
|
|
124
148
|
mininterval=0.1, # Minimum time between updates (seconds)
|
125
149
|
)
|
126
150
|
|
127
|
-
for page_idx,
|
128
|
-
#
|
129
|
-
|
151
|
+
for page_idx, search_obj in enumerate(pages_to_search):
|
152
|
+
# Determine if we're searching in a page or a region
|
153
|
+
if hasattr(search_obj, "page") and hasattr(search_obj, "bbox"):
|
154
|
+
# This is a Region - render only the region area
|
155
|
+
region = search_obj
|
156
|
+
page = region.page
|
157
|
+
page_image = region.render(resolution=resolution, crop=True)
|
158
|
+
# Region offset for coordinate conversion
|
159
|
+
region_x0, region_y0 = region.x0, region.top
|
160
|
+
else:
|
161
|
+
# This is a Page - render the full page
|
162
|
+
page = search_obj
|
163
|
+
page_image = page.render(resolution=resolution)
|
164
|
+
region_x0, region_y0 = 0, 0
|
130
165
|
|
131
166
|
# Convert page coordinates to image coordinates
|
132
167
|
scale = resolution / 72.0 # PDF is 72 DPI
|
@@ -168,7 +203,8 @@ class VisualSearchMixin:
|
|
168
203
|
template_hash=template_hash,
|
169
204
|
confidence_threshold=confidence,
|
170
205
|
sizes=sizes,
|
171
|
-
|
206
|
+
step=step,
|
207
|
+
method=method,
|
172
208
|
show_progress=False, # We handle progress ourselves
|
173
209
|
progress_callback=update_progress if progress_bar else None,
|
174
210
|
**kwargs,
|
@@ -180,10 +216,12 @@ class VisualSearchMixin:
|
|
180
216
|
|
181
217
|
# Convert from image pixels to PDF points
|
182
218
|
# No flipping needed! PDF coordinates map directly to PIL coordinates
|
183
|
-
pdf_x0 = img_x0 / scale
|
184
|
-
pdf_y0 = img_y0 / scale
|
185
|
-
pdf_x1 = img_x1 / scale
|
186
|
-
pdf_y1 = img_y1 / scale
|
219
|
+
pdf_x0 = img_x0 / scale + region_x0
|
220
|
+
pdf_y0 = img_y0 / scale + region_y0
|
221
|
+
pdf_x1 = img_x1 / scale + region_x0
|
222
|
+
pdf_y1 = img_y1 / scale + region_y0
|
223
|
+
|
224
|
+
from .results import Match
|
187
225
|
|
188
226
|
# Create Match object
|
189
227
|
match = Match(
|
@@ -206,4 +244,6 @@ class VisualSearchMixin:
|
|
206
244
|
if progress_bar:
|
207
245
|
progress_bar.close()
|
208
246
|
|
247
|
+
from .results import MatchResults
|
248
|
+
|
209
249
|
return MatchResults(all_matches)
|
natural_pdf/vision/results.py
CHANGED
@@ -2,7 +2,6 @@
|
|
2
2
|
|
3
3
|
from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple
|
4
4
|
|
5
|
-
# Import Region directly as it's a base class
|
6
5
|
from natural_pdf.elements.region import Region
|
7
6
|
|
8
7
|
if TYPE_CHECKING:
|
@@ -39,16 +38,41 @@ class Match(Region):
|
|
39
38
|
|
40
39
|
|
41
40
|
class MatchResults:
|
42
|
-
"""
|
41
|
+
"""
|
42
|
+
Collection of Match objects with transformation methods.
|
43
|
+
|
44
|
+
Matches are automatically sorted by confidence (highest first), so:
|
45
|
+
- matches[0] is the best match
|
46
|
+
- Iteration yields matches from best to worst
|
47
|
+
- The .top(n) method returns the n best matches
|
48
|
+
|
49
|
+
Example:
|
50
|
+
>>> matches = page.find_similar(logo_region)
|
51
|
+
>>> print(f"Found {len(matches)} matches")
|
52
|
+
>>>
|
53
|
+
>>> # Best match
|
54
|
+
>>> best = matches[0]
|
55
|
+
>>> print(f"Best match confidence: {best.confidence:.3f}")
|
56
|
+
>>>
|
57
|
+
>>> # Top 5 matches
|
58
|
+
>>> for match in matches.top(5):
|
59
|
+
... print(f"Confidence: {match.confidence:.3f} at page {match.page.number}")
|
60
|
+
>>>
|
61
|
+
>>> # All matches above 90% confidence
|
62
|
+
>>> high_conf = matches.filter_by_confidence(0.9)
|
63
|
+
"""
|
43
64
|
|
44
65
|
def __init__(self, matches: List[Match]):
|
45
|
-
"""Initialize with list of Match objects"""
|
66
|
+
"""Initialize with list of Match objects, automatically sorted by confidence"""
|
46
67
|
# Import here to avoid circular import
|
47
68
|
from natural_pdf.elements.element_collection import ElementCollection
|
48
69
|
|
70
|
+
# Sort matches by confidence (highest first)
|
71
|
+
sorted_matches = sorted(matches, key=lambda m: m.confidence, reverse=True)
|
72
|
+
|
49
73
|
# Create a base ElementCollection
|
50
|
-
self._collection = ElementCollection(
|
51
|
-
self._matches =
|
74
|
+
self._collection = ElementCollection(sorted_matches)
|
75
|
+
self._matches = sorted_matches
|
52
76
|
|
53
77
|
def __len__(self):
|
54
78
|
return len(self._matches)
|
@@ -68,6 +92,26 @@ class MatchResults:
|
|
68
92
|
"""Filter matches by minimum confidence"""
|
69
93
|
return self.filter(lambda m: m.confidence >= min_confidence)
|
70
94
|
|
95
|
+
def top(self, n: int) -> "MatchResults":
|
96
|
+
"""
|
97
|
+
Get the top N matches with highest confidence.
|
98
|
+
|
99
|
+
Args:
|
100
|
+
n: Number of top matches to return
|
101
|
+
|
102
|
+
Returns:
|
103
|
+
New MatchResults with only the top N matches
|
104
|
+
|
105
|
+
Example:
|
106
|
+
>>> matches = page.find_similar(logo)
|
107
|
+
>>> best_5 = matches.top(5)
|
108
|
+
>>> for match in best_5:
|
109
|
+
... print(f"Confidence: {match.confidence:.3f}")
|
110
|
+
"""
|
111
|
+
# Since matches are already sorted by confidence, just take first n
|
112
|
+
top_matches = self._matches[:n]
|
113
|
+
return MatchResults(top_matches)
|
114
|
+
|
71
115
|
def pages(self):
|
72
116
|
"""Get unique pages containing matches"""
|
73
117
|
# Import here to avoid circular import
|