natural-pdf 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/collections/mixins.py +16 -3
- natural_pdf/core/highlighting_service.py +25 -1
- natural_pdf/core/page.py +5 -3
- natural_pdf/core/page_collection.py +14 -14
- natural_pdf/core/pdf.py +4 -1
- natural_pdf/core/pdf_collection.py +131 -4
- natural_pdf/core/render_spec.py +46 -2
- natural_pdf/elements/base.py +66 -28
- natural_pdf/elements/element_collection.py +10 -10
- natural_pdf/elements/region.py +29 -27
- natural_pdf/vision/__init__.py +7 -0
- natural_pdf/vision/mixin.py +209 -0
- natural_pdf/vision/results.py +146 -0
- natural_pdf/vision/similarity.py +321 -0
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.4.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.4.dist-info}/RECORD +20 -16
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.4.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.4.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.4.dist-info}/top_level.txt +0 -0
@@ -1673,9 +1673,9 @@ class ElementCollection(
|
|
1673
1673
|
|
1674
1674
|
Args:
|
1675
1675
|
selector: CSS-like selector string
|
1676
|
-
|
1677
|
-
'
|
1678
|
-
(default: "
|
1676
|
+
overlap: How to determine if elements overlap: 'full' (fully inside),
|
1677
|
+
'partial' (any overlap), or 'center' (center point inside).
|
1678
|
+
(default: "full")
|
1679
1679
|
apply_exclusions: Whether to exclude elements in exclusion regions
|
1680
1680
|
"""
|
1681
1681
|
return self.apply(lambda element: element.find(selector, **kwargs))
|
@@ -1685,7 +1685,7 @@ class ElementCollection(
|
|
1685
1685
|
self,
|
1686
1686
|
*,
|
1687
1687
|
text: str,
|
1688
|
-
|
1688
|
+
overlap: str = "full",
|
1689
1689
|
apply_exclusions: bool = True,
|
1690
1690
|
regex: bool = False,
|
1691
1691
|
case: bool = True,
|
@@ -1697,7 +1697,7 @@ class ElementCollection(
|
|
1697
1697
|
self,
|
1698
1698
|
selector: str,
|
1699
1699
|
*,
|
1700
|
-
|
1700
|
+
overlap: str = "full",
|
1701
1701
|
apply_exclusions: bool = True,
|
1702
1702
|
regex: bool = False,
|
1703
1703
|
case: bool = True,
|
@@ -1709,7 +1709,7 @@ class ElementCollection(
|
|
1709
1709
|
selector: Optional[str] = None,
|
1710
1710
|
*,
|
1711
1711
|
text: Optional[str] = None,
|
1712
|
-
|
1712
|
+
overlap: str = "full",
|
1713
1713
|
apply_exclusions: bool = True,
|
1714
1714
|
regex: bool = False,
|
1715
1715
|
case: bool = True,
|
@@ -1724,9 +1724,9 @@ class ElementCollection(
|
|
1724
1724
|
Args:
|
1725
1725
|
selector: CSS-like selector string.
|
1726
1726
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1727
|
-
|
1728
|
-
'
|
1729
|
-
(default: "
|
1727
|
+
overlap: How to determine if elements overlap: 'full' (fully inside),
|
1728
|
+
'partial' (any overlap), or 'center' (center point inside).
|
1729
|
+
(default: "full")
|
1730
1730
|
apply_exclusions: Whether to apply exclusion regions (default: True).
|
1731
1731
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1732
1732
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1748,7 +1748,7 @@ class ElementCollection(
|
|
1748
1748
|
found_in_element: "ElementCollection" = element.find_all(
|
1749
1749
|
selector=selector,
|
1750
1750
|
text=text,
|
1751
|
-
|
1751
|
+
overlap=overlap,
|
1752
1752
|
apply_exclusions=apply_exclusions,
|
1753
1753
|
regex=regex,
|
1754
1754
|
case=case,
|
natural_pdf/elements/region.py
CHANGED
@@ -221,7 +221,7 @@ class Region(
|
|
221
221
|
self,
|
222
222
|
mode: Literal["show", "render"] = "show",
|
223
223
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
224
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
224
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
225
225
|
crop: Union[bool, Literal["content"]] = True, # Default to True for regions
|
226
226
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
227
227
|
**kwargs,
|
@@ -231,7 +231,7 @@ class Region(
|
|
231
231
|
Args:
|
232
232
|
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
233
233
|
color: Color for highlighting this region in show mode
|
234
|
-
highlights: Additional highlight groups to show
|
234
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
235
235
|
crop: Whether to crop to this region
|
236
236
|
crop_bbox: Explicit crop bounds (overrides region bounds)
|
237
237
|
**kwargs: Additional parameters
|
@@ -250,10 +250,12 @@ class Region(
|
|
250
250
|
# Crop to this region's bounds
|
251
251
|
spec.crop_bbox = self.bbox
|
252
252
|
|
253
|
-
# Add highlights in show mode
|
254
|
-
if mode == "show":
|
255
|
-
#
|
256
|
-
|
253
|
+
# Add highlights in show mode (unless explicitly disabled with highlights=False)
|
254
|
+
if mode == "show" and highlights is not False:
|
255
|
+
# Only highlight this region if:
|
256
|
+
# 1. We're not cropping, OR
|
257
|
+
# 2. We're cropping but color was explicitly specified
|
258
|
+
if not crop or color is not None:
|
257
259
|
spec.add_highlight(
|
258
260
|
bbox=self.bbox,
|
259
261
|
polygon=self.polygon if self.has_polygon else None,
|
@@ -261,8 +263,8 @@ class Region(
|
|
261
263
|
label=self.label or self.name or "Region",
|
262
264
|
)
|
263
265
|
|
264
|
-
# Add additional highlight groups if provided
|
265
|
-
if highlights:
|
266
|
+
# Add additional highlight groups if provided (and highlights is a list)
|
267
|
+
if highlights and isinstance(highlights, list):
|
266
268
|
for group in highlights:
|
267
269
|
elements = group.get("elements", [])
|
268
270
|
group_color = group.get("color", color)
|
@@ -1982,7 +1984,7 @@ class Region(
|
|
1982
1984
|
self,
|
1983
1985
|
*,
|
1984
1986
|
text: str,
|
1985
|
-
|
1987
|
+
overlap: str = "full",
|
1986
1988
|
apply_exclusions: bool = True,
|
1987
1989
|
regex: bool = False,
|
1988
1990
|
case: bool = True,
|
@@ -1994,7 +1996,7 @@ class Region(
|
|
1994
1996
|
self,
|
1995
1997
|
selector: str,
|
1996
1998
|
*,
|
1997
|
-
|
1999
|
+
overlap: str = "full",
|
1998
2000
|
apply_exclusions: bool = True,
|
1999
2001
|
regex: bool = False,
|
2000
2002
|
case: bool = True,
|
@@ -2006,7 +2008,7 @@ class Region(
|
|
2006
2008
|
selector: Optional[str] = None, # Now optional
|
2007
2009
|
*,
|
2008
2010
|
text: Optional[str] = None, # New text parameter
|
2009
|
-
|
2011
|
+
overlap: str = "full", # How elements overlap with the region
|
2010
2012
|
apply_exclusions: bool = True,
|
2011
2013
|
regex: bool = False,
|
2012
2014
|
case: bool = True,
|
@@ -2020,9 +2022,9 @@ class Region(
|
|
2020
2022
|
Args:
|
2021
2023
|
selector: CSS-like selector string.
|
2022
2024
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
2023
|
-
|
2024
|
-
'
|
2025
|
-
(default: "
|
2025
|
+
overlap: How to determine if elements overlap with the region: 'full' (fully inside),
|
2026
|
+
'partial' (any overlap), or 'center' (center point inside).
|
2027
|
+
(default: "full")
|
2026
2028
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
2027
2029
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
2028
2030
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -2035,7 +2037,7 @@ class Region(
|
|
2035
2037
|
elements = self.find_all(
|
2036
2038
|
selector=selector,
|
2037
2039
|
text=text,
|
2038
|
-
|
2040
|
+
overlap=overlap,
|
2039
2041
|
apply_exclusions=apply_exclusions,
|
2040
2042
|
regex=regex,
|
2041
2043
|
case=case,
|
@@ -2048,7 +2050,7 @@ class Region(
|
|
2048
2050
|
self,
|
2049
2051
|
*,
|
2050
2052
|
text: str,
|
2051
|
-
|
2053
|
+
overlap: str = "full",
|
2052
2054
|
apply_exclusions: bool = True,
|
2053
2055
|
regex: bool = False,
|
2054
2056
|
case: bool = True,
|
@@ -2060,7 +2062,7 @@ class Region(
|
|
2060
2062
|
self,
|
2061
2063
|
selector: str,
|
2062
2064
|
*,
|
2063
|
-
|
2065
|
+
overlap: str = "full",
|
2064
2066
|
apply_exclusions: bool = True,
|
2065
2067
|
regex: bool = False,
|
2066
2068
|
case: bool = True,
|
@@ -2072,7 +2074,7 @@ class Region(
|
|
2072
2074
|
selector: Optional[str] = None, # Now optional
|
2073
2075
|
*,
|
2074
2076
|
text: Optional[str] = None, # New text parameter
|
2075
|
-
|
2077
|
+
overlap: str = "full", # How elements overlap with the region
|
2076
2078
|
apply_exclusions: bool = True,
|
2077
2079
|
regex: bool = False,
|
2078
2080
|
case: bool = True,
|
@@ -2086,9 +2088,9 @@ class Region(
|
|
2086
2088
|
Args:
|
2087
2089
|
selector: CSS-like selector string.
|
2088
2090
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
2089
|
-
|
2090
|
-
'
|
2091
|
-
(default: "
|
2091
|
+
overlap: How to determine if elements overlap with the region: 'full' (fully inside),
|
2092
|
+
'partial' (any overlap), or 'center' (center point inside).
|
2093
|
+
(default: "full")
|
2092
2094
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
2093
2095
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
2094
2096
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -2104,10 +2106,10 @@ class Region(
|
|
2104
2106
|
if selector is None and text is None:
|
2105
2107
|
raise ValueError("Provide either 'selector' or 'text'.")
|
2106
2108
|
|
2107
|
-
# Validate
|
2108
|
-
if
|
2109
|
+
# Validate overlap parameter
|
2110
|
+
if overlap not in ["full", "partial", "center"]:
|
2109
2111
|
raise ValueError(
|
2110
|
-
f"Invalid
|
2112
|
+
f"Invalid overlap value: {overlap}. Must be 'full', 'partial', or 'center'"
|
2111
2113
|
)
|
2112
2114
|
|
2113
2115
|
# Construct selector if 'text' is provided
|
@@ -2142,7 +2144,7 @@ class Region(
|
|
2142
2144
|
region_bbox = self.bbox
|
2143
2145
|
matching_elements = []
|
2144
2146
|
|
2145
|
-
if
|
2147
|
+
if overlap == "full": # Fully inside (strict)
|
2146
2148
|
matching_elements = [
|
2147
2149
|
el
|
2148
2150
|
for el in potential_elements
|
@@ -2151,9 +2153,9 @@ class Region(
|
|
2151
2153
|
and el.x1 <= region_bbox[2]
|
2152
2154
|
and el.bottom <= region_bbox[3]
|
2153
2155
|
]
|
2154
|
-
elif
|
2156
|
+
elif overlap == "partial": # Any overlap
|
2155
2157
|
matching_elements = [el for el in potential_elements if self.intersects(el)]
|
2156
|
-
elif
|
2158
|
+
elif overlap == "center": # Center point inside
|
2157
2159
|
matching_elements = [
|
2158
2160
|
el for el in potential_elements if self.is_element_center_inside(el)
|
2159
2161
|
]
|
@@ -0,0 +1,7 @@
|
|
1
|
+
"""Vision module for visual similarity and pattern matching"""
|
2
|
+
|
3
|
+
from .mixin import VisualSearchMixin
|
4
|
+
from .results import Match, MatchResults
|
5
|
+
from .similarity import VisualMatcher, compute_phash
|
6
|
+
|
7
|
+
__all__ = ["VisualMatcher", "compute_phash", "Match", "MatchResults", "VisualSearchMixin"]
|
@@ -0,0 +1,209 @@
|
|
1
|
+
"""Mixin to add visual similarity search to Page/PDF/PDFCollection"""
|
2
|
+
|
3
|
+
from typing import List, Optional, Tuple, Union
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
from PIL import Image
|
7
|
+
from tqdm.auto import tqdm
|
8
|
+
|
9
|
+
from .results import Match, MatchResults
|
10
|
+
from .similarity import VisualMatcher, compute_phash
|
11
|
+
|
12
|
+
|
13
|
+
def _pages_for_visual_search(target):
    """Return the pages a visual search started on ``target`` should scan.

    Dispatch mirrors the three supported hosts: a ``PDFCollection`` (matched
    by class name), any object exposing ``pages`` (treated as a PDF), and any
    object exposing ``number`` (treated as a single page).

    Raises:
        TypeError: If ``target`` matches none of the supported shapes.
    """
    if target.__class__.__name__ == "PDFCollection":
        # PDFCollection needs to iterate through all PDFs
        pages = []
        for pdf in target:
            pages.extend(pdf.pages)
        return pages
    if hasattr(target, "pages"):  # PDF
        return target.pages
    if hasattr(target, "number"):  # Single page
        return [target]
    raise TypeError(f"Cannot search in {type(target)}")


def _estimate_window_count(pages, templates, scales, resolution, step_factor):
    """Estimate how many sliding windows will be examined, for the progress bar.

    Page pixel sizes are derived from the page size in points (72 per inch)
    at ``resolution`` DPI. A template/scale combination that does not fit on
    a page contributes no windows.
    """
    total = 0
    for page in pages:
        page_w = int(page.width * resolution / 72.0)
        page_h = int(page.height * resolution / 72.0)

        for template_data in templates:
            template_w, template_h = template_data["image"].size

            for scale in scales:
                scaled_w = int(template_w * scale)
                scaled_h = int(template_h * scale)
                if scaled_w > page_w or scaled_h > page_h:
                    continue

                step_x = max(1, int(scaled_w * step_factor))
                step_y = max(1, int(scaled_h * step_factor))
                x_windows = len(range(0, page_w - scaled_w + 1, step_x))
                y_windows = len(range(0, page_h - scaled_h + 1, step_y))
                total += x_windows * y_windows
    return total


class VisualSearchMixin:
    """Add find_similar method to classes that include this mixin"""

    def find_similar(
        self,
        examples: Union["Element", "Region", List[Union["Element", "Region"]]],
        using: str = "vision",
        confidence: float = 0.6,
        sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
        resolution: int = 72,
        hash_size: int = 20,
        step_factor: float = 0.1,
        max_per_page: Optional[int] = None,
        show_progress: bool = True,
        **kwargs,
    ) -> "MatchResults":
        """
        Find regions visually similar to the given example(s).

        Args:
            examples: Single element/region or list of examples to search for
            using: Search method - currently only 'vision' is supported
            confidence: Minimum similarity score (0-1)
            sizes: Size variations to search (default: (0.8, 1.2)). Can be:
                - float: ±percentage (e.g., 0.2 = 80%-120%)
                - tuple(min, max): search range with smart logarithmic steps
                - tuple(min, max, step): explicit step size
                - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
            resolution: Resolution for image comparison (DPI) (default: 72)
            hash_size: Size of perceptual hash grid (default: 20)
            step_factor: Step size as fraction of template size (default: 0.1)
            max_per_page: Maximum matches to return per page; the highest
                confidence matches are kept. None or 0 means no limit.
            show_progress: Show progress bar for multi-page searches (default: True)
            **kwargs: Additional options, forwarded to
                ``VisualMatcher.find_matches_in_image``

        Returns:
            MatchResults collection

        Raises:
            NotImplementedError: If ``using`` is anything other than 'vision'.
            TypeError: If this object is not a PDFCollection, a PDF-like
                object with ``pages``, or a page-like object with ``number``.
        """
        if using != "vision":
            raise NotImplementedError(f"using='{using}' not yet supported")

        # Ensure examples is a list
        if not isinstance(examples, list):
            examples = [examples]

        # Resolve the pages first so an unsupported host fails before any
        # example is rendered.
        pages_to_search = _pages_for_visual_search(self)

        # Initialize matcher with specified hash size
        matcher = VisualMatcher(hash_size=hash_size)

        # Prepare templates: render each example once and hash it
        templates = []
        for example in examples:
            example_image = example.render(resolution=resolution, crop=True)
            template_hash = compute_phash(example_image, hash_size=hash_size)
            templates.append({"image": example_image, "hash": template_hash, "source": example})

        # Calculate total operations for progress bar
        total_operations = 0
        if show_progress:
            scales = matcher._get_search_scales(sizes)
            total_operations = _estimate_window_count(
                pages_to_search, templates, scales, resolution, step_factor
            )

        update_frequency = max(1, total_operations // 1000)  # Update at most 1000 times

        # Create single progress bar for all operations
        progress_bar = None
        if show_progress and total_operations > 0:
            progress_bar = tqdm(
                total=total_operations,
                desc="Searching",
                unit="window",
                miniters=update_frequency,  # Minimum iterations between updates
                mininterval=0.1,  # Minimum time between updates (seconds)
            )

        operations_done = 0
        last_update = 0
        all_matches = []

        # Image pixels per PDF point (PDF is 72 DPI)
        scale = resolution / 72.0

        try:
            for page in pages_to_search:
                # Render the full page once
                page_image = page.render(resolution=resolution)
                page_matches = []

                # Search for each template
                for template_idx, template_data in enumerate(templates):

                    # Callback invoked by the matcher; it is called synchronously,
                    # so `page` and `template_idx` refer to the current iteration.
                    def update_progress():
                        nonlocal operations_done, last_update
                        operations_done += 1

                        # Only update progress bar every N operations to avoid
                        # overwhelming output
                        if progress_bar is not None and (
                            operations_done - last_update >= update_frequency
                            or operations_done == total_operations
                        ):
                            progress_bar.update(operations_done - last_update)
                            last_update = operations_done

                            # Update description with current page/template info
                            if len(pages_to_search) > 1:
                                progress_bar.set_description(
                                    f"Page {page.number}/{len(pages_to_search)}"
                                )
                            elif len(templates) > 1:
                                progress_bar.set_description(
                                    f"Template {template_idx + 1}/{len(templates)}"
                                )

                    # Find matches in this page - never show internal progress
                    candidates = matcher.find_matches_in_image(
                        template_data["image"],
                        page_image,
                        template_hash=template_data["hash"],
                        confidence_threshold=confidence,
                        sizes=sizes,
                        step_factor=step_factor,
                        show_progress=False,  # We handle progress ourselves
                        progress_callback=update_progress if progress_bar is not None else None,
                        **kwargs,
                    )

                    # Convert image coordinates back to PDF coordinates.
                    # No flipping needed: only the pixel/point scale is undone.
                    for candidate in candidates:
                        img_x0, img_y0, img_x1, img_y1 = candidate.bbox
                        page_matches.append(
                            Match(
                                page=page,
                                bbox=(
                                    img_x0 / scale,
                                    img_y0 / scale,
                                    img_x1 / scale,
                                    img_y1 / scale,
                                ),
                                confidence=candidate.confidence,
                                source_example=template_data["source"],
                            )
                        )

                # Apply max_per_page limit if specified
                if max_per_page and len(page_matches) > max_per_page:
                    # Sort by confidence and take top N
                    page_matches.sort(key=lambda m: m.confidence, reverse=True)
                    page_matches = page_matches[:max_per_page]

                all_matches.extend(page_matches)

            # The window count is only an estimate, so flush whatever the
            # batched updates have not reported yet.
            if progress_bar is not None and operations_done > last_update:
                progress_bar.update(operations_done - last_update)
        finally:
            # Close the bar even when rendering or matching raises
            if progress_bar is not None:
                progress_bar.close()

        return MatchResults(all_matches)
|
@@ -0,0 +1,146 @@
|
|
1
|
+
"""Match results for visual similarity search"""
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple
|
4
|
+
|
5
|
+
# Import Region directly as it's a base class
|
6
|
+
from natural_pdf.elements.region import Region
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from natural_pdf.core.page_collection import PageCollection
|
10
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
11
|
+
|
12
|
+
|
13
|
+
class Match(Region):
    """Region produced by a visual similarity search, carrying its score."""

    def __init__(self, page, bbox, confidence, source_example=None, metadata=None):
        """
        Build a match located on ``page`` at ``bbox``.

        Args:
            page: Page containing the match
            bbox: Bounding box of the match
            confidence: Similarity confidence (0-1)
            source_example: The example/template that led to this match
            metadata: Additional metadata about the match; a falsy value
                (None or an empty mapping) is replaced by a new empty dict
        """
        super().__init__(page, bbox)
        if not metadata:
            metadata = {}
        self.metadata = metadata
        self.source_example = source_example
        self.confidence = confidence

    @property
    def pdf(self):
        """The PDF that owns the page this match was found on."""
        owning_page = self.page
        return owning_page.pdf

    def __repr__(self):
        return f"<Match page={self.page.number} confidence={self.confidence:.2f} bbox={self.bbox}>"
|
39
|
+
|
40
|
+
|
41
|
+
class MatchResults:
    """Collection of Match objects with transformation methods"""

    def __init__(self, matches: List["Match"]):
        """Initialize with list of Match objects.

        The list is stored as given (not copied) and is also wrapped in an
        ElementCollection kept on ``_collection``.
        """
        # Import here to avoid circular import
        from natural_pdf.elements.element_collection import ElementCollection

        self._collection = ElementCollection(matches)
        self._matches = matches

    def __len__(self):
        return len(self._matches)

    def __iter__(self):
        return iter(self._matches)

    def __getitem__(self, key):
        return self._matches[key]

    def filter(self, filter_func) -> "MatchResults":
        """Return a new MatchResults with the matches for which ``filter_func`` is truthy."""
        return MatchResults([m for m in self if filter_func(m)])

    def filter_by_confidence(self, min_confidence: float) -> "MatchResults":
        """Return the matches whose confidence is at least ``min_confidence``."""
        return self.filter(lambda m: m.confidence >= min_confidence)

    def pages(self):
        """Get unique pages containing matches, in first-seen order.

        Side effect: each returned page gets a ``_matches`` attribute holding
        a MatchResults of the matches found on it.
        """
        # Import here to avoid circular import
        from natural_pdf.core.page_collection import PageCollection

        # Single pass; dicts keep insertion order, so first-seen order is kept
        by_page = {}
        for match in self:
            by_page.setdefault(match.page, []).append(match)

        # Attach matches to each page
        for page, page_matches in by_page.items():
            page._matches = MatchResults(page_matches)

        return PageCollection(list(by_page))

    def pdfs(self):
        """Get unique PDFs containing matches, in first-seen order.

        Side effect: each returned PDF gets a ``_matches`` attribute holding
        a MatchResults of the matches found in it.
        """
        # Import here to avoid circular import
        from natural_pdf.core.pdf_collection import PDFCollection

        by_pdf = {}
        for match in self:
            by_pdf.setdefault(match.pdf, []).append(match)

        # Attach matches to each PDF
        for pdf, pdf_matches in by_pdf.items():
            pdf._matches = MatchResults(pdf_matches)

        return PDFCollection(list(by_pdf))

    def group_by_page(self) -> Iterator[Tuple[Any, "MatchResults"]]:
        """Yield ``(page, MatchResults)`` pairs ordered by PDF filename and page number.

        Matches are bucketed per page before ordering, so every page is
        yielded exactly once even when two pages share the same
        (filename, page number) sort key.
        """
        by_page = {}
        for match in self:
            by_page.setdefault(match.page, []).append(match)

        def order_key(item):
            page, page_matches = item
            # A PDF without a ``filename`` attribute sorts under ""
            return (getattr(page_matches[0].pdf, "filename", ""), page.number)

        for page, page_matches in sorted(by_page.items(), key=order_key):
            yield page, MatchResults(page_matches)

    def sort_by_confidence(self, descending: bool = True) -> "MatchResults":
        """Return a new MatchResults ordered by confidence (highest first by default)."""
        return MatchResults(sorted(self, key=lambda m: m.confidence, reverse=descending))

    def regions(self):
        """Get all matches as an ElementCollection of regions"""
        # Import here to avoid circular import
        from natural_pdf.elements.element_collection import ElementCollection

        # Matches are already Region objects, so just wrap them
        return ElementCollection(list(self))

    def show(self, **kwargs):
        """Show all matches using ElementCollection.show()"""
        return self.regions().show(**kwargs)

    def __repr__(self):
        count = len(self)
        if count == 0:
            return "<MatchResults: empty>"
        if count == 1:
            return "<MatchResults: 1 match>"
        scores = [m.confidence for m in self]
        conf_range = f"{min(scores):.2f}-{max(scores):.2f}"
        return f"<MatchResults: {count} matches, confidence {conf_range}>"
|