natural-pdf 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/core/page.py +2 -0
- natural_pdf/core/pdf.py +4 -1
- natural_pdf/core/pdf_collection.py +131 -4
- natural_pdf/core/render_spec.py +2 -2
- natural_pdf/elements/base.py +18 -14
- natural_pdf/elements/region.py +10 -8
- natural_pdf/vision/__init__.py +7 -0
- natural_pdf/vision/mixin.py +209 -0
- natural_pdf/vision/results.py +146 -0
- natural_pdf/vision/similarity.py +321 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/RECORD +16 -12
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.4.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -78,6 +78,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
78
78
|
|
79
79
|
# # Import new utils
|
80
80
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
81
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
81
82
|
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
|
82
83
|
|
83
84
|
# --- End Classification Imports --- #
|
@@ -101,6 +102,7 @@ class Page(
|
|
101
102
|
ExtractionMixin,
|
102
103
|
ShapeDetectionMixin,
|
103
104
|
DescribeMixin,
|
105
|
+
VisualSearchMixin,
|
104
106
|
Visualizable,
|
105
107
|
):
|
106
108
|
"""Enhanced Page wrapper built on top of pdfplumber.Page.
|
natural_pdf/core/pdf.py
CHANGED
@@ -42,6 +42,7 @@ from natural_pdf.ocr import OCRManager, OCROptions
|
|
42
42
|
from natural_pdf.selectors.parser import parse_selector
|
43
43
|
from natural_pdf.text_mixin import TextMixin
|
44
44
|
from natural_pdf.utils.locks import pdf_render_lock
|
45
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
45
46
|
|
46
47
|
if TYPE_CHECKING:
|
47
48
|
from natural_pdf.elements.element_collection import ElementCollection
|
@@ -252,7 +253,9 @@ class _LazyPageList(Sequence):
|
|
252
253
|
# --- End Lazy Page List Helper --- #
|
253
254
|
|
254
255
|
|
255
|
-
class PDF(
|
256
|
+
class PDF(
|
257
|
+
TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, VisualSearchMixin, Visualizable
|
258
|
+
):
|
256
259
|
"""Enhanced PDF wrapper built on top of pdfplumber.
|
257
260
|
|
258
261
|
This class provides a fluent interface for working with PDF documents,
|
@@ -40,6 +40,7 @@ logger = logging.getLogger(__name__)
|
|
40
40
|
from natural_pdf.core.pdf import PDF
|
41
41
|
from natural_pdf.elements.region import Region
|
42
42
|
from natural_pdf.export.mixin import ExportMixin
|
43
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
43
44
|
|
44
45
|
# --- Search Imports ---
|
45
46
|
try:
|
@@ -69,8 +70,8 @@ from natural_pdf.search.searchable_mixin import SearchableMixin # Import the ne
|
|
69
70
|
|
70
71
|
|
71
72
|
class PDFCollection(
|
72
|
-
SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin
|
73
|
-
):
|
73
|
+
SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin, VisualSearchMixin
|
74
|
+
):
|
74
75
|
def __init__(
|
75
76
|
self,
|
76
77
|
source: Union[str, Iterable[Union[str, "PDF"]]],
|
@@ -258,8 +259,6 @@ class PDFCollection(
|
|
258
259
|
return iter(self._pdfs)
|
259
260
|
|
260
261
|
def __repr__(self) -> str:
|
261
|
-
# Removed search status
|
262
|
-
return f"<PDFCollection(count={len(self._pdfs)})>"
|
263
262
|
return f"<PDFCollection(count={len(self._pdfs)})>"
|
264
263
|
|
265
264
|
@property
|
@@ -267,6 +266,134 @@ class PDFCollection(
|
|
267
266
|
"""Returns the list of PDF objects held by the collection."""
|
268
267
|
return self._pdfs
|
269
268
|
|
269
|
+
def show(self, limit: Optional[int] = 30, per_pdf_limit: Optional[int] = 10, **kwargs):
|
270
|
+
"""
|
271
|
+
Display all PDFs in the collection with labels.
|
272
|
+
|
273
|
+
Each PDF is shown with its pages in a grid layout (6 columns by default),
|
274
|
+
and all PDFs are stacked vertically with labels.
|
275
|
+
|
276
|
+
Args:
|
277
|
+
limit: Maximum total pages to show across all PDFs (default: 30)
|
278
|
+
per_pdf_limit: Maximum pages to show per PDF (default: 10)
|
279
|
+
**kwargs: Additional arguments passed to each PDF's show() method
|
280
|
+
(e.g., columns, exclusions, resolution, etc.)
|
281
|
+
|
282
|
+
Returns:
|
283
|
+
Displayed image in Jupyter or None
|
284
|
+
"""
|
285
|
+
if not self._pdfs:
|
286
|
+
print("Empty collection")
|
287
|
+
return None
|
288
|
+
|
289
|
+
# Import here to avoid circular imports
|
290
|
+
import numpy as np
|
291
|
+
from PIL import Image, ImageDraw, ImageFont
|
292
|
+
|
293
|
+
# Calculate pages per PDF if total limit is set
|
294
|
+
if limit and not per_pdf_limit:
|
295
|
+
per_pdf_limit = max(1, limit // len(self._pdfs))
|
296
|
+
|
297
|
+
# Collect images from each PDF
|
298
|
+
all_images = []
|
299
|
+
total_pages_shown = 0
|
300
|
+
|
301
|
+
for pdf in self._pdfs:
|
302
|
+
if limit and total_pages_shown >= limit:
|
303
|
+
break
|
304
|
+
|
305
|
+
# Calculate limit for this PDF
|
306
|
+
pdf_limit = per_pdf_limit
|
307
|
+
if limit:
|
308
|
+
remaining = limit - total_pages_shown
|
309
|
+
pdf_limit = min(per_pdf_limit or remaining, remaining)
|
310
|
+
|
311
|
+
# Get PDF identifier
|
312
|
+
pdf_name = getattr(pdf, "filename", None) or getattr(pdf, "path", "Unknown")
|
313
|
+
if isinstance(pdf_name, Path):
|
314
|
+
pdf_name = pdf_name.name
|
315
|
+
elif "/" in str(pdf_name):
|
316
|
+
pdf_name = str(pdf_name).split("/")[-1]
|
317
|
+
|
318
|
+
# Render this PDF
|
319
|
+
try:
|
320
|
+
# Get render specs from the PDF
|
321
|
+
render_specs = pdf._get_render_specs(mode="show", max_pages=pdf_limit, **kwargs)
|
322
|
+
|
323
|
+
if not render_specs:
|
324
|
+
continue
|
325
|
+
|
326
|
+
# Get the highlighter and render without displaying
|
327
|
+
highlighter = pdf._get_highlighter()
|
328
|
+
pdf_image = highlighter.unified_render(
|
329
|
+
specs=render_specs,
|
330
|
+
layout="grid" if len(render_specs) > 1 else "single",
|
331
|
+
columns=6,
|
332
|
+
**kwargs,
|
333
|
+
)
|
334
|
+
|
335
|
+
if pdf_image:
|
336
|
+
# Add label above the PDF image
|
337
|
+
label_height = 40
|
338
|
+
label_bg_color = (240, 240, 240)
|
339
|
+
label_text_color = (0, 0, 0)
|
340
|
+
|
341
|
+
# Create new image with space for label
|
342
|
+
width, height = pdf_image.size
|
343
|
+
labeled_image = Image.new("RGB", (width, height + label_height), "white")
|
344
|
+
|
345
|
+
# Draw label background
|
346
|
+
draw = ImageDraw.Draw(labeled_image)
|
347
|
+
draw.rectangle([0, 0, width, label_height], fill=label_bg_color)
|
348
|
+
|
349
|
+
# Draw label text
|
350
|
+
try:
|
351
|
+
# Try to use a nice font if available
|
352
|
+
font = ImageFont.truetype("Arial", 20)
|
353
|
+
except:
|
354
|
+
# Fallback to default font
|
355
|
+
font = ImageFont.load_default()
|
356
|
+
|
357
|
+
label_text = f"{pdf_name} ({len(pdf.pages)} pages)"
|
358
|
+
draw.text((10, 10), label_text, fill=label_text_color, font=font)
|
359
|
+
|
360
|
+
# Paste PDF image below label
|
361
|
+
labeled_image.paste(pdf_image, (0, label_height))
|
362
|
+
|
363
|
+
all_images.append(labeled_image)
|
364
|
+
total_pages_shown += min(pdf_limit, len(pdf.pages))
|
365
|
+
|
366
|
+
except Exception as e:
|
367
|
+
logger.warning(f"Failed to render PDF {pdf_name}: {e}")
|
368
|
+
continue
|
369
|
+
|
370
|
+
if not all_images:
|
371
|
+
print("No PDFs could be rendered")
|
372
|
+
return None
|
373
|
+
|
374
|
+
# Combine all images vertically
|
375
|
+
if len(all_images) == 1:
|
376
|
+
combined = all_images[0]
|
377
|
+
else:
|
378
|
+
# Add spacing between PDFs
|
379
|
+
spacing = 20
|
380
|
+
total_height = sum(img.height for img in all_images) + spacing * (len(all_images) - 1)
|
381
|
+
max_width = max(img.width for img in all_images)
|
382
|
+
|
383
|
+
combined = Image.new("RGB", (max_width, total_height), "white")
|
384
|
+
|
385
|
+
y_offset = 0
|
386
|
+
for i, img in enumerate(all_images):
|
387
|
+
# Center images if they're narrower than max width
|
388
|
+
x_offset = (max_width - img.width) // 2
|
389
|
+
combined.paste(img, (x_offset, y_offset))
|
390
|
+
y_offset += img.height
|
391
|
+
if i < len(all_images) - 1:
|
392
|
+
y_offset += spacing
|
393
|
+
|
394
|
+
# Return the combined image (Jupyter will display it automatically)
|
395
|
+
return combined
|
396
|
+
|
270
397
|
@overload
|
271
398
|
def find_all(
|
272
399
|
self,
|
natural_pdf/core/render_spec.py
CHANGED
@@ -186,7 +186,7 @@ class Visualizable:
|
|
186
186
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
187
187
|
labels: bool = True,
|
188
188
|
label_format: Optional[str] = None,
|
189
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
189
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
190
190
|
legend_position: str = "right",
|
191
191
|
annotate: Optional[Union[str, List[str]]] = None,
|
192
192
|
# Layout options for multi-page/region
|
@@ -211,7 +211,7 @@ class Visualizable:
|
|
211
211
|
color: Default highlight color
|
212
212
|
labels: Whether to show labels for highlights
|
213
213
|
label_format: Format string for labels (e.g., "Element {index}")
|
214
|
-
highlights: Additional highlight groups to show
|
214
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
215
215
|
legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
|
216
216
|
annotate: Attribute name(s) to display on highlights (string or list)
|
217
217
|
layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)
|
natural_pdf/elements/base.py
CHANGED
@@ -1192,7 +1192,7 @@ class Element(
|
|
1192
1192
|
self,
|
1193
1193
|
mode: Literal["show", "render"] = "show",
|
1194
1194
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
1195
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
1195
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
1196
1196
|
crop: Union[bool, Literal["content"]] = False,
|
1197
1197
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
1198
1198
|
label: Optional[str] = None,
|
@@ -1203,7 +1203,7 @@ class Element(
|
|
1203
1203
|
Args:
|
1204
1204
|
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
1205
1205
|
color: Color for highlighting this element in show mode
|
1206
|
-
highlights: Additional highlight groups to show
|
1206
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
1207
1207
|
crop: Whether to crop to element bounds
|
1208
1208
|
crop_bbox: Explicit crop bounds
|
1209
1209
|
label: Optional label for this element
|
@@ -1225,19 +1225,23 @@ class Element(
|
|
1225
1225
|
if hasattr(self, "bbox") and self.bbox:
|
1226
1226
|
spec.crop_bbox = self.bbox
|
1227
1227
|
|
1228
|
-
# Add highlight in show mode
|
1229
|
-
if mode == "show":
|
1230
|
-
#
|
1231
|
-
|
1232
|
-
|
1233
|
-
|
1234
|
-
|
1235
|
-
|
1236
|
-
|
1237
|
-
|
1228
|
+
# Add highlight in show mode (unless explicitly disabled with highlights=False)
|
1229
|
+
if mode == "show" and highlights is not False:
|
1230
|
+
# Only highlight this element if:
|
1231
|
+
# 1. We're not cropping, OR
|
1232
|
+
# 2. We're cropping but color was explicitly specified
|
1233
|
+
if not crop or color is not None:
|
1234
|
+
# Use provided label or generate one
|
1235
|
+
element_label = label if label is not None else self.__class__.__name__
|
1236
|
+
|
1237
|
+
spec.add_highlight(
|
1238
|
+
element=self,
|
1239
|
+
color=color or "red", # Default red for single element
|
1240
|
+
label=element_label,
|
1241
|
+
)
|
1238
1242
|
|
1239
|
-
# Add additional highlight groups if provided
|
1240
|
-
if highlights:
|
1243
|
+
# Add additional highlight groups if provided (and highlights is a list)
|
1244
|
+
if highlights and isinstance(highlights, list):
|
1241
1245
|
for group in highlights:
|
1242
1246
|
group_elements = group.get("elements", [])
|
1243
1247
|
group_color = group.get("color", color)
|
natural_pdf/elements/region.py
CHANGED
@@ -221,7 +221,7 @@ class Region(
|
|
221
221
|
self,
|
222
222
|
mode: Literal["show", "render"] = "show",
|
223
223
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
224
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
224
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
225
225
|
crop: Union[bool, Literal["content"]] = True, # Default to True for regions
|
226
226
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
227
227
|
**kwargs,
|
@@ -231,7 +231,7 @@ class Region(
|
|
231
231
|
Args:
|
232
232
|
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
233
233
|
color: Color for highlighting this region in show mode
|
234
|
-
highlights: Additional highlight groups to show
|
234
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
235
235
|
crop: Whether to crop to this region
|
236
236
|
crop_bbox: Explicit crop bounds (overrides region bounds)
|
237
237
|
**kwargs: Additional parameters
|
@@ -250,10 +250,12 @@ class Region(
|
|
250
250
|
# Crop to this region's bounds
|
251
251
|
spec.crop_bbox = self.bbox
|
252
252
|
|
253
|
-
# Add highlights in show mode
|
254
|
-
if mode == "show":
|
255
|
-
#
|
256
|
-
|
253
|
+
# Add highlights in show mode (unless explicitly disabled with highlights=False)
|
254
|
+
if mode == "show" and highlights is not False:
|
255
|
+
# Only highlight this region if:
|
256
|
+
# 1. We're not cropping, OR
|
257
|
+
# 2. We're cropping but color was explicitly specified
|
258
|
+
if not crop or color is not None:
|
257
259
|
spec.add_highlight(
|
258
260
|
bbox=self.bbox,
|
259
261
|
polygon=self.polygon if self.has_polygon else None,
|
@@ -261,8 +263,8 @@ class Region(
|
|
261
263
|
label=self.label or self.name or "Region",
|
262
264
|
)
|
263
265
|
|
264
|
-
# Add additional highlight groups if provided
|
265
|
-
if highlights:
|
266
|
+
# Add additional highlight groups if provided (and highlights is a list)
|
267
|
+
if highlights and isinstance(highlights, list):
|
266
268
|
for group in highlights:
|
267
269
|
elements = group.get("elements", [])
|
268
270
|
group_color = group.get("color", color)
|
@@ -0,0 +1,7 @@
|
|
1
|
+
"""Vision module for visual similarity and pattern matching"""
|
2
|
+
|
3
|
+
from .mixin import VisualSearchMixin
|
4
|
+
from .results import Match, MatchResults
|
5
|
+
from .similarity import VisualMatcher, compute_phash
|
6
|
+
|
7
|
+
__all__ = ["VisualMatcher", "compute_phash", "Match", "MatchResults", "VisualSearchMixin"]
|
@@ -0,0 +1,209 @@
|
|
1
|
+
"""Mixin to add visual similarity search to Page/PDF/PDFCollection"""
|
2
|
+
|
3
|
+
from typing import List, Optional, Tuple, Union
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
from PIL import Image
|
7
|
+
from tqdm.auto import tqdm
|
8
|
+
|
9
|
+
from .results import Match, MatchResults
|
10
|
+
from .similarity import VisualMatcher, compute_phash
|
11
|
+
|
12
|
+
|
13
|
+
class VisualSearchMixin:
|
14
|
+
"""Add find_similar method to classes that include this mixin"""
|
15
|
+
|
16
|
+
def find_similar(
|
17
|
+
self,
|
18
|
+
examples: Union["Element", "Region", List[Union["Element", "Region"]]],
|
19
|
+
using: str = "vision",
|
20
|
+
confidence: float = 0.6,
|
21
|
+
sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
|
22
|
+
resolution: int = 72,
|
23
|
+
hash_size: int = 20,
|
24
|
+
step_factor: float = 0.1,
|
25
|
+
max_per_page: Optional[int] = None,
|
26
|
+
show_progress: bool = True,
|
27
|
+
**kwargs,
|
28
|
+
) -> MatchResults:
|
29
|
+
"""
|
30
|
+
Find regions visually similar to the given example(s).
|
31
|
+
|
32
|
+
Args:
|
33
|
+
examples: Single element/region or list of examples to search for
|
34
|
+
using: Search method - currently only 'vision' is supported
|
35
|
+
confidence: Minimum similarity score (0-1)
|
36
|
+
sizes: Size variations to search. Can be:
|
37
|
+
- float: ±percentage (e.g., 0.2 = 80%-120%)
|
38
|
+
- tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.0))
|
39
|
+
- tuple(min, max, step): explicit step size
|
40
|
+
- list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
|
41
|
+
resolution: Resolution for image comparison (DPI) (default: 72)
|
42
|
+
hash_size: Size of perceptual hash grid (default: 12)
|
43
|
+
step_factor: Step size as fraction of template size (default: 0.1)
|
44
|
+
max_per_page: Maximum matches to return per page
|
45
|
+
show_progress: Show progress bar for multi-page searches (default: True)
|
46
|
+
**kwargs: Additional options
|
47
|
+
|
48
|
+
Returns:
|
49
|
+
MatchResults collection
|
50
|
+
"""
|
51
|
+
if using != "vision":
|
52
|
+
raise NotImplementedError(f"using='{using}' not yet supported")
|
53
|
+
|
54
|
+
# Ensure examples is a list
|
55
|
+
if not isinstance(examples, list):
|
56
|
+
examples = [examples]
|
57
|
+
|
58
|
+
# Initialize matcher with specified hash size
|
59
|
+
matcher = VisualMatcher(hash_size=hash_size)
|
60
|
+
|
61
|
+
# Prepare templates
|
62
|
+
templates = []
|
63
|
+
for example in examples:
|
64
|
+
# Render the example region/element
|
65
|
+
example_image = example.render(resolution=resolution, crop=True)
|
66
|
+
template_hash = compute_phash(example_image, hash_size=hash_size)
|
67
|
+
templates.append({"image": example_image, "hash": template_hash, "source": example})
|
68
|
+
|
69
|
+
# Get pages to search based on the object type
|
70
|
+
if hasattr(self, "__class__") and self.__class__.__name__ == "PDFCollection":
|
71
|
+
# PDFCollection needs to iterate through all PDFs
|
72
|
+
pages_to_search = []
|
73
|
+
for pdf in self:
|
74
|
+
pages_to_search.extend(pdf.pages)
|
75
|
+
elif hasattr(self, "pages"): # PDF
|
76
|
+
pages_to_search = self.pages
|
77
|
+
elif hasattr(self, "number"): # Single page
|
78
|
+
pages_to_search = [self]
|
79
|
+
else:
|
80
|
+
raise TypeError(f"Cannot search in {type(self)}")
|
81
|
+
|
82
|
+
# Calculate total operations for progress bar
|
83
|
+
total_operations = 0
|
84
|
+
if show_progress:
|
85
|
+
# Get scales that will be searched
|
86
|
+
scales = matcher._get_search_scales(sizes)
|
87
|
+
|
88
|
+
# Pre-calculate for all pages and templates
|
89
|
+
for page in pages_to_search:
|
90
|
+
# Estimate page image size
|
91
|
+
page_w = int(page.width * resolution / 72.0)
|
92
|
+
page_h = int(page.height * resolution / 72.0)
|
93
|
+
|
94
|
+
for template_data in templates:
|
95
|
+
template_w, template_h = template_data["image"].size
|
96
|
+
|
97
|
+
for scale in scales:
|
98
|
+
scaled_w = int(template_w * scale)
|
99
|
+
scaled_h = int(template_h * scale)
|
100
|
+
|
101
|
+
if scaled_w <= page_w and scaled_h <= page_h:
|
102
|
+
step_x = max(1, int(scaled_w * step_factor))
|
103
|
+
step_y = max(1, int(scaled_h * step_factor))
|
104
|
+
|
105
|
+
x_windows = len(range(0, page_w - scaled_w + 1, step_x))
|
106
|
+
y_windows = len(range(0, page_h - scaled_h + 1, step_y))
|
107
|
+
total_operations += x_windows * y_windows
|
108
|
+
|
109
|
+
# Search each page
|
110
|
+
all_matches = []
|
111
|
+
|
112
|
+
# Create single progress bar for all operations
|
113
|
+
progress_bar = None
|
114
|
+
operations_done = 0
|
115
|
+
last_update = 0
|
116
|
+
update_frequency = max(1, total_operations // 1000) # Update at most 1000 times
|
117
|
+
|
118
|
+
if show_progress and total_operations > 0:
|
119
|
+
progress_bar = tqdm(
|
120
|
+
total=total_operations,
|
121
|
+
desc="Searching",
|
122
|
+
unit="window",
|
123
|
+
miniters=update_frequency, # Minimum iterations between updates
|
124
|
+
mininterval=0.1, # Minimum time between updates (seconds)
|
125
|
+
)
|
126
|
+
|
127
|
+
for page_idx, page in enumerate(pages_to_search):
|
128
|
+
# Render the full page once
|
129
|
+
page_image = page.render(resolution=resolution)
|
130
|
+
|
131
|
+
# Convert page coordinates to image coordinates
|
132
|
+
scale = resolution / 72.0 # PDF is 72 DPI
|
133
|
+
|
134
|
+
page_matches = []
|
135
|
+
|
136
|
+
# Search for each template
|
137
|
+
for template_idx, template_data in enumerate(templates):
|
138
|
+
template_image = template_data["image"]
|
139
|
+
template_hash = template_data["hash"]
|
140
|
+
|
141
|
+
# Custom progress callback to update our main progress bar
|
142
|
+
def update_progress():
|
143
|
+
nonlocal operations_done, last_update
|
144
|
+
operations_done += 1
|
145
|
+
|
146
|
+
# Only update progress bar every N operations to avoid overwhelming output
|
147
|
+
if progress_bar and (
|
148
|
+
operations_done - last_update >= update_frequency
|
149
|
+
or operations_done == total_operations
|
150
|
+
):
|
151
|
+
progress_bar.update(operations_done - last_update)
|
152
|
+
last_update = operations_done
|
153
|
+
|
154
|
+
# Update description with current page/template info
|
155
|
+
if len(pages_to_search) > 1:
|
156
|
+
progress_bar.set_description(
|
157
|
+
f"Page {page.number}/{len(pages_to_search)}"
|
158
|
+
)
|
159
|
+
elif len(templates) > 1:
|
160
|
+
progress_bar.set_description(
|
161
|
+
f"Template {template_idx + 1}/{len(templates)}"
|
162
|
+
)
|
163
|
+
|
164
|
+
# Find matches in this page - never show internal progress
|
165
|
+
candidates = matcher.find_matches_in_image(
|
166
|
+
template_image,
|
167
|
+
page_image,
|
168
|
+
template_hash=template_hash,
|
169
|
+
confidence_threshold=confidence,
|
170
|
+
sizes=sizes,
|
171
|
+
step_factor=step_factor,
|
172
|
+
show_progress=False, # We handle progress ourselves
|
173
|
+
progress_callback=update_progress if progress_bar else None,
|
174
|
+
**kwargs,
|
175
|
+
)
|
176
|
+
|
177
|
+
# Convert image coordinates back to PDF coordinates
|
178
|
+
for candidate in candidates:
|
179
|
+
img_x0, img_y0, img_x1, img_y1 = candidate.bbox
|
180
|
+
|
181
|
+
# Convert from image pixels to PDF points
|
182
|
+
# No flipping needed! PDF coordinates map directly to PIL coordinates
|
183
|
+
pdf_x0 = img_x0 / scale
|
184
|
+
pdf_y0 = img_y0 / scale
|
185
|
+
pdf_x1 = img_x1 / scale
|
186
|
+
pdf_y1 = img_y1 / scale
|
187
|
+
|
188
|
+
# Create Match object
|
189
|
+
match = Match(
|
190
|
+
page=page,
|
191
|
+
bbox=(pdf_x0, pdf_y0, pdf_x1, pdf_y1),
|
192
|
+
confidence=candidate.confidence,
|
193
|
+
source_example=template_data["source"],
|
194
|
+
)
|
195
|
+
page_matches.append(match)
|
196
|
+
|
197
|
+
# Apply max_per_page limit if specified
|
198
|
+
if max_per_page and len(page_matches) > max_per_page:
|
199
|
+
# Sort by confidence and take top N
|
200
|
+
page_matches.sort(key=lambda m: m.confidence, reverse=True)
|
201
|
+
page_matches = page_matches[:max_per_page]
|
202
|
+
|
203
|
+
all_matches.extend(page_matches)
|
204
|
+
|
205
|
+
# Close progress bar
|
206
|
+
if progress_bar:
|
207
|
+
progress_bar.close()
|
208
|
+
|
209
|
+
return MatchResults(all_matches)
|
@@ -0,0 +1,146 @@
|
|
1
|
+
"""Match results for visual similarity search"""
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple
|
4
|
+
|
5
|
+
# Import Region directly as it's a base class
|
6
|
+
from natural_pdf.elements.region import Region
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from natural_pdf.core.page_collection import PageCollection
|
10
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
11
|
+
|
12
|
+
|
13
|
+
class Match(Region):
|
14
|
+
"""A region that was found via visual similarity search"""
|
15
|
+
|
16
|
+
def __init__(self, page, bbox, confidence, source_example=None, metadata=None):
|
17
|
+
"""
|
18
|
+
Initialize a Match object.
|
19
|
+
|
20
|
+
Args:
|
21
|
+
page: Page containing the match
|
22
|
+
bbox: Bounding box of the match
|
23
|
+
confidence: Similarity confidence (0-1)
|
24
|
+
source_example: The example/template that led to this match
|
25
|
+
metadata: Additional metadata about the match
|
26
|
+
"""
|
27
|
+
super().__init__(page, bbox)
|
28
|
+
self.confidence = confidence
|
29
|
+
self.source_example = source_example
|
30
|
+
self.metadata = metadata or {}
|
31
|
+
|
32
|
+
@property
|
33
|
+
def pdf(self):
|
34
|
+
"""Get the PDF containing this match"""
|
35
|
+
return self.page.pdf
|
36
|
+
|
37
|
+
def __repr__(self):
|
38
|
+
return f"<Match page={self.page.number} confidence={self.confidence:.2f} bbox={self.bbox}>"
|
39
|
+
|
40
|
+
|
41
|
+
class MatchResults:
|
42
|
+
"""Collection of Match objects with transformation methods"""
|
43
|
+
|
44
|
+
def __init__(self, matches: List[Match]):
|
45
|
+
"""Initialize with list of Match objects"""
|
46
|
+
# Import here to avoid circular import
|
47
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
48
|
+
|
49
|
+
# Create a base ElementCollection
|
50
|
+
self._collection = ElementCollection(matches)
|
51
|
+
self._matches = matches
|
52
|
+
|
53
|
+
def __len__(self):
|
54
|
+
return len(self._matches)
|
55
|
+
|
56
|
+
def __iter__(self):
|
57
|
+
return iter(self._matches)
|
58
|
+
|
59
|
+
def __getitem__(self, key):
|
60
|
+
return self._matches[key]
|
61
|
+
|
62
|
+
def filter(self, filter_func) -> "MatchResults":
|
63
|
+
"""Filter matches by a function"""
|
64
|
+
filtered = [m for m in self if filter_func(m)]
|
65
|
+
return MatchResults(filtered)
|
66
|
+
|
67
|
+
def filter_by_confidence(self, min_confidence: float) -> "MatchResults":
|
68
|
+
"""Filter matches by minimum confidence"""
|
69
|
+
return self.filter(lambda m: m.confidence >= min_confidence)
|
70
|
+
|
71
|
+
def pages(self):
|
72
|
+
"""Get unique pages containing matches"""
|
73
|
+
# Import here to avoid circular import
|
74
|
+
from natural_pdf.core.page_collection import PageCollection
|
75
|
+
|
76
|
+
# Get unique pages while preserving order
|
77
|
+
seen = set()
|
78
|
+
unique_pages = []
|
79
|
+
for match in self:
|
80
|
+
if match.page not in seen:
|
81
|
+
seen.add(match.page)
|
82
|
+
unique_pages.append(match.page)
|
83
|
+
|
84
|
+
# Attach matches to each page
|
85
|
+
for page in unique_pages:
|
86
|
+
page._matches = MatchResults([m for m in self if m.page == page])
|
87
|
+
|
88
|
+
return PageCollection(unique_pages)
|
89
|
+
|
90
|
+
def pdfs(self):
|
91
|
+
"""Get unique PDFs containing matches"""
|
92
|
+
# Import here to avoid circular import
|
93
|
+
from natural_pdf.core.pdf_collection import PDFCollection
|
94
|
+
|
95
|
+
# Get unique PDFs while preserving order
|
96
|
+
seen = set()
|
97
|
+
unique_pdfs = []
|
98
|
+
for match in self:
|
99
|
+
if match.pdf not in seen:
|
100
|
+
seen.add(match.pdf)
|
101
|
+
unique_pdfs.append(match.pdf)
|
102
|
+
|
103
|
+
# Attach matches to each PDF
|
104
|
+
for pdf in unique_pdfs:
|
105
|
+
pdf._matches = MatchResults([m for m in self if m.pdf == pdf])
|
106
|
+
|
107
|
+
return PDFCollection(unique_pdfs)
|
108
|
+
|
109
|
+
def group_by_page(self) -> Iterator[Tuple[Any, "MatchResults"]]:
|
110
|
+
"""Group matches by page"""
|
111
|
+
from itertools import groupby
|
112
|
+
|
113
|
+
# Sort by PDF filename and page number
|
114
|
+
sorted_matches = sorted(self, key=lambda m: (getattr(m.pdf, "filename", ""), m.page.number))
|
115
|
+
|
116
|
+
for page, matches in groupby(sorted_matches, key=lambda m: m.page):
|
117
|
+
yield page, MatchResults(list(matches))
|
118
|
+
|
119
|
+
def sort_by_confidence(self, descending: bool = True) -> "MatchResults":
|
120
|
+
"""Sort matches by confidence score"""
|
121
|
+
sorted_matches = sorted(self, key=lambda m: m.confidence, reverse=descending)
|
122
|
+
return MatchResults(sorted_matches)
|
123
|
+
|
124
|
+
def regions(self):
|
125
|
+
"""Get all matches as an ElementCollection of regions"""
|
126
|
+
# Import here to avoid circular import
|
127
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
128
|
+
|
129
|
+
# Matches are already Region objects, so just wrap them
|
130
|
+
return ElementCollection(list(self))
|
131
|
+
|
132
|
+
def show(self, **kwargs):
|
133
|
+
"""Show all matches using ElementCollection.show()"""
|
134
|
+
# Get regions and show them
|
135
|
+
return self.regions().show(**kwargs)
|
136
|
+
|
137
|
+
def __repr__(self):
|
138
|
+
if len(self) == 0:
|
139
|
+
return "<MatchResults: empty>"
|
140
|
+
elif len(self) == 1:
|
141
|
+
return f"<MatchResults: 1 match>"
|
142
|
+
else:
|
143
|
+
conf_range = (
|
144
|
+
f"{min(m.confidence for m in self):.2f}-{max(m.confidence for m in self):.2f}"
|
145
|
+
)
|
146
|
+
return f"<MatchResults: {len(self)} matches, confidence {conf_range}>"
|
@@ -0,0 +1,321 @@
|
|
1
|
+
"""Visual similarity matching using perceptual hashing"""
|
2
|
+
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from typing import Callable, List, Optional, Tuple, Union
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
from PIL import Image
|
8
|
+
from tqdm.auto import tqdm
|
9
|
+
|
10
|
+
|
11
|
+
@dataclass
|
12
|
+
class MatchCandidate:
|
13
|
+
"""Candidate match during sliding window search"""
|
14
|
+
|
15
|
+
bbox: Tuple[float, float, float, float]
|
16
|
+
hash_value: int
|
17
|
+
confidence: float
|
18
|
+
|
19
|
+
|
20
|
+
def compute_phash(image: Image.Image, hash_size: int = 8, blur_radius: float = 0) -> int:
|
21
|
+
"""
|
22
|
+
Compute perceptual hash of an image using DCT.
|
23
|
+
|
24
|
+
Args:
|
25
|
+
image: PIL Image to hash
|
26
|
+
hash_size: Size of the hash (8 = 64 bit hash)
|
27
|
+
blur_radius: Optional blur to apply before hashing (makes more tolerant)
|
28
|
+
|
29
|
+
Returns:
|
30
|
+
Integer hash value
|
31
|
+
"""
|
32
|
+
# Convert to grayscale
|
33
|
+
if image.mode != "L":
|
34
|
+
image = image.convert("L")
|
35
|
+
|
36
|
+
# Optional blur to reduce sensitivity to minor variations
|
37
|
+
if blur_radius > 0:
|
38
|
+
from PIL import ImageFilter
|
39
|
+
|
40
|
+
image = image.filter(ImageFilter.GaussianBlur(radius=blur_radius))
|
41
|
+
|
42
|
+
# Resize to 32x32 (4x the hash size for DCT)
|
43
|
+
highfreq_factor = 4
|
44
|
+
img_size = hash_size * highfreq_factor
|
45
|
+
image = image.resize((img_size, img_size), Image.Resampling.LANCZOS)
|
46
|
+
|
47
|
+
# Convert to numpy array
|
48
|
+
pixels = np.array(image, dtype=np.float32)
|
49
|
+
|
50
|
+
# Apply DCT
|
51
|
+
from scipy.fftpack import dct
|
52
|
+
|
53
|
+
dct_coef = dct(dct(pixels, axis=0), axis=1)
|
54
|
+
|
55
|
+
# Keep top-left 8x8 (low frequencies)
|
56
|
+
dct_low = dct_coef[:hash_size, :hash_size]
|
57
|
+
|
58
|
+
# Compute median excluding the DC component
|
59
|
+
dct_low_no_dc = dct_low.flatten()[1:] # Skip first element (DC)
|
60
|
+
median = np.median(dct_low_no_dc)
|
61
|
+
|
62
|
+
# Create binary hash
|
63
|
+
diff = dct_low.flatten() > median
|
64
|
+
|
65
|
+
# Convert to integer
|
66
|
+
return sum(2**i for i, v in enumerate(diff) if v)
|
67
|
+
|
68
|
+
|
69
|
+
def hamming_distance(hash1: int, hash2: int, hash_size: int = 64) -> int:
|
70
|
+
"""Calculate Hamming distance between two hashes"""
|
71
|
+
# XOR and count set bits
|
72
|
+
xor = hash1 ^ hash2
|
73
|
+
return bin(xor).count("1")
|
74
|
+
|
75
|
+
|
76
|
+
def hash_similarity(hash1: int, hash2: int, hash_size: int = 64) -> float:
|
77
|
+
"""Calculate similarity score between two hashes (0-1)"""
|
78
|
+
distance = hamming_distance(hash1, hash2, hash_size)
|
79
|
+
return 1.0 - (distance / hash_size)
|
80
|
+
|
81
|
+
|
82
|
+
class VisualMatcher:
    """Finds visual matches of a template image inside a target image.

    Compares a sliding window over the target against the template using
    perceptual hashing (pHash), optionally at multiple scales, and returns
    non-overlapping match candidates ranked by confidence.
    """

    def __init__(self, hash_size: int = 12):
        """
        Args:
            hash_size: Side length of the square perceptual-hash grid; the
                resulting hash carries ``hash_size * hash_size`` bits.
        """
        self.hash_size = hash_size
        self.hash_bits = hash_size * hash_size  # total bits per hash
        self._cache = {}  # reserved for future hash caching

    def _get_search_scales(self, sizes: Optional[Union[float, Tuple, List]]) -> List[float]:
        """
        Convert various size input formats to a list of scales to search.

        Args:
            sizes: Can be:
                - None: just 1.0
                - float: ±percentage (e.g., 0.2 = 80%-120%)
                - tuple(min, max): range with smart logarithmic steps
                - tuple(min, max, step): explicit step size
                - list: exact sizes to use

        Returns:
            List of scale factors to search.

        Raises:
            ValueError: If ``sizes`` is not one of the supported formats, or
                an explicit step is not positive.
        """
        if sizes is None:
            return [1.0]

        # Explicit list of scales: use as-is (sorted for a deterministic order).
        if isinstance(sizes, list):
            return sorted(sizes)

        # Single number: interpret as ±percentage around 1.0 and fall through
        # to the (min, max) tuple handling below.
        if isinstance(sizes, (int, float)):
            if sizes <= 0:
                return [1.0]
            sizes = (max(0.1, 1.0 - sizes), 1.0 + sizes)

        if isinstance(sizes, tuple):
            if len(sizes) == 2:
                min_scale, max_scale = sizes
                if min_scale >= max_scale:
                    return [min_scale]

                # Smart defaults: more steps for wider ranges, spaced
                # logarithmically so relative size changes are even.
                ratio = max_scale / min_scale
                if ratio <= 1.5:  # Small range (e.g., 0.8-1.2)
                    num_steps = 5
                elif ratio <= 3.0:  # Medium range (e.g., 0.5-1.5)
                    num_steps = 7
                else:  # Large range (e.g., 0.5-2.0)
                    num_steps = 9

                log_scales = np.linspace(np.log(min_scale), np.log(max_scale), num_steps)
                scales = np.exp(log_scales).tolist()

                # Guarantee the native scale is searched when it lies in range:
                # snap the closest generated scale to exactly 1.0.
                if min_scale <= 1.0 <= max_scale and 1.0 not in scales:
                    closest_idx = np.argmin([abs(s - 1.0) for s in scales])
                    scales[closest_idx] = 1.0

                return scales

            elif len(sizes) == 3:
                # Explicit (min, max, step).
                min_scale, max_scale, step = sizes
                if step <= 0:
                    raise ValueError(f"Invalid sizes format: {sizes}")
                # Degenerate/inverted range: mirror the (min, max) behavior
                # instead of indexing into an empty list.
                if min_scale > max_scale:
                    return [min_scale]

                scales = []
                current = min_scale
                while current <= max_scale:
                    scales.append(current)
                    current += step
                # Include max_scale when float accumulation stopped just short.
                if scales and scales[-1] < max_scale and (max_scale - scales[-1]) < step * 0.1:
                    scales[-1] = max_scale
                return scales

        raise ValueError(f"Invalid sizes format: {sizes}")

    @staticmethod
    def _scan_geometry(
        template_size: Tuple[int, int],
        target_size: Tuple[int, int],
        scale: float,
        step_factor: float,
    ) -> Optional[Tuple[int, int, int, int]]:
        """Window geometry for one scale: (scaled_w, scaled_h, step_x, step_y).

        Returns None when the scaled template does not fit inside the target.
        """
        template_w, template_h = template_size
        target_w, target_h = target_size
        scaled_w = int(template_w * scale)
        scaled_h = int(template_h * scale)
        if scaled_w > target_w or scaled_h > target_h:
            return None
        # Step is a fraction of the window size, but at least one pixel.
        step_x = max(1, int(scaled_w * step_factor))
        step_y = max(1, int(scaled_h * step_factor))
        return scaled_w, scaled_h, step_x, step_y

    def find_matches_in_image(
        self,
        template: Image.Image,
        target: Image.Image,
        template_hash: Optional[int] = None,
        confidence_threshold: float = 0.6,
        step_factor: float = 0.1,
        sizes: Optional[Union[float, Tuple, List]] = None,
        show_progress: bool = True,
        progress_callback: Optional[Callable[[], None]] = None,
    ) -> List[MatchCandidate]:
        """
        Find all matches of template in target image using a sliding window.

        Args:
            template: Template image to search for
            target: Target image to search in
            template_hash: Pre-computed hash of template (optional)
            confidence_threshold: Minimum similarity score (0-1)
            step_factor: Step size as fraction of template size
            sizes: Size variations to search. Can be:
                - float: ±percentage (e.g., 0.2 = 80%-120%)
                - tuple(min, max): search range with smart logarithmic steps
                - tuple(min, max, step): explicit step size
                - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
            show_progress: Show progress bar for sliding window search
            progress_callback: Optional callback function to call for each window checked

        Returns:
            List of MatchCandidate objects (overlaps already filtered out).
        """
        matches = []

        if template_hash is None:
            template_hash = compute_phash(template, self.hash_size)

        template_size = template.size
        target_size = target.size
        template_w, template_h = template_size
        target_w, target_h = target_size

        scales = self._get_search_scales(sizes)

        # Set up a progress bar only when no external callback was supplied;
        # its total is the exact number of windows that will be scanned.
        progress_bar = None
        if show_progress and not progress_callback:
            total_iterations = 0
            for scale in scales:
                geometry = self._scan_geometry(template_size, target_size, scale, step_factor)
                if geometry is None:
                    continue
                scaled_w, scaled_h, step_x, step_y = geometry
                x_steps = len(range(0, target_w - scaled_w + 1, step_x))
                y_steps = len(range(0, target_h - scaled_h + 1, step_y))
                total_iterations += x_steps * y_steps
            if total_iterations > 0:
                progress_bar = tqdm(
                    total=total_iterations, desc="Scanning", unit="window", leave=False
                )

        # Sliding-window search at each scale.
        for scale in scales:
            geometry = self._scan_geometry(template_size, target_size, scale, step_factor)
            if geometry is None:
                continue
            scaled_w, scaled_h, step_x, step_y = geometry

            for y in range(0, target_h - scaled_h + 1, step_y):
                for x in range(0, target_w - scaled_w + 1, step_x):
                    window = target.crop((x, y, x + scaled_w, y + scaled_h))

                    # Resize back to template size so the hashes are comparable.
                    if scale != 1.0:
                        window = window.resize(
                            (template_w, template_h), Image.Resampling.LANCZOS
                        )

                    window_hash = compute_phash(window, self.hash_size)
                    similarity = hash_similarity(template_hash, window_hash, self.hash_bits)

                    if similarity >= confidence_threshold:
                        # bbox is in target-image pixel coordinates.
                        bbox = (x, y, x + scaled_w, y + scaled_h)
                        matches.append(MatchCandidate(bbox, window_hash, similarity))

                    if progress_bar:
                        progress_bar.update(1)
                    elif progress_callback:
                        progress_callback()

        if progress_bar:
            progress_bar.close()

        # Remove overlapping matches (keep highest confidence).
        return self._filter_overlapping_matches(matches)

    def _filter_overlapping_matches(
        self, matches: List[MatchCandidate], overlap_threshold: float = 0.5
    ) -> List[MatchCandidate]:
        """Remove overlapping matches, keeping the highest-confidence ones.

        Greedy non-maximum suppression: matches are visited from most to
        least confident, and a candidate is dropped when its IoU with any
        already-kept match exceeds ``overlap_threshold``.
        """
        if not matches:
            return matches

        sorted_matches = sorted(matches, key=lambda m: m.confidence, reverse=True)
        filtered = []

        for candidate in sorted_matches:
            keep = True
            for selected in filtered:
                if self._calculate_overlap(candidate.bbox, selected.bbox) > overlap_threshold:
                    keep = False
                    break
            if keep:
                filtered.append(candidate)

        return filtered

    def _calculate_overlap(self, bbox1: Tuple, bbox2: Tuple) -> float:
        """Intersection-over-union (IoU) of two (xmin, ymin, xmax, ymax) boxes."""
        x1_min, y1_min, x1_max, y1_max = bbox1
        x2_min, y2_min, x2_max, y2_max = bbox2

        intersect_xmin = max(x1_min, x2_min)
        intersect_ymin = max(y1_min, y2_min)
        intersect_xmax = min(x1_max, x2_max)
        intersect_ymax = min(y1_max, y2_max)

        # No overlap at all.
        if intersect_xmax < intersect_xmin or intersect_ymax < intersect_ymin:
            return 0.0

        intersect_area = (intersect_xmax - intersect_xmin) * (intersect_ymax - intersect_ymin)

        area1 = (x1_max - x1_min) * (y1_max - y1_min)
        area2 = (x2_max - x2_min) * (y2_max - y2_min)
        union_area = area1 + area2 - intersect_area

        return intersect_area / union_area if union_area > 0 else 0.0
|
@@ -27,24 +27,24 @@ natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666
|
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
28
|
natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
|
29
29
|
natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
|
30
|
-
natural_pdf/core/page.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=XrDePXZgXgB3w8hvxh4-EhPQnrwmw-0z-I_K24__OtY,142550
|
31
31
|
natural_pdf/core/page_collection.py,sha256=hEeXs_fzB73XZ8ZkHz2kIuSgBYcVYydvGMMdGuB1rvw,52486
|
32
32
|
natural_pdf/core/page_groupby.py,sha256=550ME6kd-h-2u75oUIIIqTYsmh8VvdQO1nXXioL8J6A,7378
|
33
|
-
natural_pdf/core/pdf.py,sha256=
|
34
|
-
natural_pdf/core/pdf_collection.py,sha256=
|
35
|
-
natural_pdf/core/render_spec.py,sha256=
|
33
|
+
natural_pdf/core/pdf.py,sha256=Loe6sbQzBp9VDeIAuDS3zQmeDWvQMj5SWIQMky5bPDA,101964
|
34
|
+
natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
|
35
|
+
natural_pdf/core/render_spec.py,sha256=rLicaS9EPyojpJcjy2Lzn5DLWQwjrFyDJyRo7jbjdGU,14505
|
36
36
|
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
37
37
|
natural_pdf/describe/base.py,sha256=Of9WVo9XuShXoeyJr0RN2CpLhF_CeiOjazl-or53RKU,18173
|
38
38
|
natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
|
39
39
|
natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
|
40
40
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
41
41
|
natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
|
42
|
-
natural_pdf/elements/base.py,sha256=
|
42
|
+
natural_pdf/elements/base.py,sha256=aj-eXOQQlhKv9lYeUlUs9aKNcUebtG_dqxURZHZVZ58,55509
|
43
43
|
natural_pdf/elements/element_collection.py,sha256=slCUnOT04sNOTjSGgmhjcCKKPVPtdDPwU7PX1ebzGMw,101342
|
44
44
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
45
45
|
natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
|
46
46
|
natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
|
47
|
-
natural_pdf/elements/region.py,sha256=
|
47
|
+
natural_pdf/elements/region.py,sha256=RxWidI7oNrdbuuj94SfdFXmcSDTfy89uGCeVMQvAfks,155591
|
48
48
|
natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
|
49
49
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
50
50
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
@@ -100,9 +100,13 @@ natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-
|
|
100
100
|
natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
|
101
101
|
natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
|
102
102
|
natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
|
103
|
+
natural_pdf/vision/__init__.py,sha256=RymMY-3WLQBlOZ4Dx4MmL9UH6I65hNjkwUJ7ymO5JfM,287
|
104
|
+
natural_pdf/vision/mixin.py,sha256=OJwBABr74TWxP5seTKUmGj5zE9mWsBP_UKWU-Pr8V9A,8720
|
105
|
+
natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs,5119
|
106
|
+
natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
|
103
107
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
104
108
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
105
|
-
natural_pdf-0.2.
|
109
|
+
natural_pdf-0.2.4.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
106
110
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
107
111
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
108
112
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
@@ -119,8 +123,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
119
123
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
120
124
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
121
125
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
122
|
-
natural_pdf-0.2.
|
123
|
-
natural_pdf-0.2.
|
124
|
-
natural_pdf-0.2.
|
125
|
-
natural_pdf-0.2.
|
126
|
-
natural_pdf-0.2.
|
126
|
+
natural_pdf-0.2.4.dist-info/METADATA,sha256=G1tmes61GVEt6zLeDISuJZgceLQywIU-uRspGA_90Q8,6959
|
127
|
+
natural_pdf-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
128
|
+
natural_pdf-0.2.4.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
129
|
+
natural_pdf-0.2.4.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
|
130
|
+
natural_pdf-0.2.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|