natural-pdf 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,321 @@
1
+ """Visual similarity matching using perceptual hashing"""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Callable, List, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ from PIL import Image
8
+ from tqdm.auto import tqdm
9
+
10
+
11
@dataclass
class MatchCandidate:
    """One sliding-window position that passed the similarity threshold.

    Instances are created by ``VisualMatcher.find_matches_in_image`` and later
    pruned by its overlap filter.
    """

    # Window rectangle in target-image pixels: (left, top, right, bottom).
    bbox: Tuple[float, float, float, float]
    # Perceptual hash computed for the window's contents.
    hash_value: int
    # Similarity between the window hash and the template hash.
    confidence: float
18
+
19
+
20
def compute_phash(image: Image.Image, hash_size: int = 8, blur_radius: float = 0) -> int:
    """
    Compute a DCT-based perceptual hash of an image.

    The image is converted to grayscale, optionally blurred, shrunk to a
    square of ``hash_size * 4`` pixels per side and passed through a 2-D DCT.
    Every coefficient in the top-left ``hash_size`` x ``hash_size`` corner
    contributes one bit: set when the coefficient is above the median of that
    corner (the median is taken without the DC term), clear otherwise.

    Args:
        image: PIL Image to hash
        hash_size: Side length of the retained DCT corner; the hash has
            ``hash_size ** 2`` bits (8 = 64 bit hash)
        blur_radius: Gaussian blur radius applied before hashing; values
            <= 0 leave the image unblurred

    Returns:
        Integer whose bit ``i`` is set when the ``i``-th flattened
        low-frequency coefficient exceeds the median
    """
    # Work on a single luminance channel
    gray = image if image.mode == "L" else image.convert("L")

    # Blurring first makes the hash less sensitive to small pixel noise
    if blur_radius > 0:
        from PIL import ImageFilter

        gray = gray.filter(ImageFilter.GaussianBlur(radius=blur_radius))

    # Oversample by 4x so the DCT has high frequencies to throw away
    side = hash_size * 4
    small = gray.resize((side, side), Image.Resampling.LANCZOS)
    samples = np.array(small, dtype=np.float32)

    from scipy.fftpack import dct

    # Separable 2-D DCT: columns first, then rows
    spectrum = dct(dct(samples, axis=0), axis=1)

    # Low frequencies live in the top-left corner; element 0 is the DC term
    low_freq = spectrum[:hash_size, :hash_size].flatten()
    threshold = np.median(low_freq[1:])

    # Pack the above-median flags into an integer, coefficient i -> bit i
    hash_value = 0
    for position, coefficient in enumerate(low_freq):
        if coefficient > threshold:
            hash_value |= 1 << position
    return hash_value
67
+
68
+
69
def hamming_distance(hash1: int, hash2: int, hash_size: int = 64) -> int:
    """Count the bit positions at which two integer hashes differ.

    ``hash_size`` is accepted for signature symmetry with
    ``hash_similarity`` but does not affect the result.
    """
    # Bits set in the XOR are exactly the positions where the hashes disagree
    differing_bits = hash1 ^ hash2
    return format(differing_bits, "b").count("1")
74
+
75
+
76
def hash_similarity(hash1: int, hash2: int, hash_size: int = 64) -> float:
    """Return the share of matching bits between two hashes.

    The result is ``1 - hamming_distance / hash_size``: 1.0 when the hashes
    are identical and 0.0 when all ``hash_size`` bits differ.
    """
    mismatched_bits = hamming_distance(hash1, hash2, hash_size)
    mismatch_fraction = mismatched_bits / hash_size
    return 1.0 - mismatch_fraction
80
+
81
+
82
class VisualMatcher:
    """Handles visual similarity matching using perceptual hashing.

    A template image is compared against every sliding-window position of a
    target image (optionally at several scales). Windows whose hash is similar
    enough to the template hash become candidates, and overlapping candidates
    are then reduced to the highest-confidence one.
    """

    def __init__(self, hash_size: int = 12):
        """
        Args:
            hash_size: Side length passed to ``compute_phash``; hashes carry
                ``hash_size * hash_size`` bits.
        """
        self.hash_size = hash_size
        self.hash_bits = hash_size * hash_size
        self._cache = {}

    def _get_search_scales(self, sizes: Optional[Union[float, Tuple, List]]) -> List[float]:
        """
        Convert various size input formats to a list of scales to search.

        Args:
            sizes: Can be:
                - None: just 1.0
                - float: ±percentage (e.g., 0.2 = 80%-120%)
                - tuple(min, max): range with smart logarithmic steps
                - tuple(min, max, step): explicit step size
                - list: exact sizes to use

        Returns:
            List of scale factors to search

        Raises:
            ValueError: If ``sizes`` has an unsupported type or tuple length,
                if a (min, max) range starts at a non-positive scale, or if
                an explicit step is not positive.
        """
        if sizes is None:
            return [1.0]

        # List of exact sizes
        if isinstance(sizes, list):
            return sorted(sizes)

        # Single number: ±percentage around 1.0, handled by the range logic below
        if isinstance(sizes, (int, float)):
            if sizes <= 0:
                return [1.0]
            sizes = (max(0.1, 1.0 - sizes), 1.0 + sizes)

        if isinstance(sizes, tuple):
            if len(sizes) == 2:
                min_scale, max_scale = sizes
                if min_scale >= max_scale:
                    return [min_scale]
                if min_scale <= 0:
                    # Logarithmic spacing is undefined for non-positive scales
                    raise ValueError(f"Scale range must be positive: {sizes}")

                # The wider the range, the more steps are used to cover it
                ratio = max_scale / min_scale
                if ratio <= 1.5:  # Small range (e.g., 0.8-1.2)
                    num_steps = 5
                elif ratio <= 3.0:  # Medium range (e.g., 0.5-1.5)
                    num_steps = 7
                else:  # Large range (e.g., 0.5-2.0)
                    num_steps = 9

                # Logarithmic spacing gives equal *relative* change per step
                log_scales = np.linspace(np.log(min_scale), np.log(max_scale), num_steps)
                scales = np.exp(log_scales).tolist()

                # Make sure the unscaled template is tried when it is in range
                if min_scale <= 1.0 <= max_scale and 1.0 not in scales:
                    closest_idx = min(range(len(scales)), key=lambda i: abs(scales[i] - 1.0))
                    scales[closest_idx] = 1.0

                return scales

            if len(sizes) == 3:
                # Explicit (min, max, step)
                min_scale, max_scale, step = sizes
                if step <= 0:
                    # A non-positive step would never reach max_scale
                    raise ValueError(f"Step must be positive: {sizes}")
                if min_scale >= max_scale:
                    # Same convention as the (min, max) form
                    return [min_scale]

                # Derive each scale from its index rather than by repeated
                # addition, so rounding error cannot push the final value past
                # max_scale (e.g. 0.8 + 4 * 0.1 -> 1.2000000000000002).
                num_steps = int((max_scale - min_scale) / step + 1e-9)
                scales = [min_scale + i * step for i in range(num_steps + 1)]

                # Snap the last value onto max when it lands within 10% of a step
                if abs(max_scale - scales[-1]) < step * 0.1:
                    scales[-1] = max_scale
                return scales

        raise ValueError(f"Invalid sizes format: {sizes}")

    def _plan_windows(
        self,
        template_size: Tuple[int, int],
        target_size: Tuple[int, int],
        scales: List[float],
        step_factor: float,
    ) -> List[Tuple[float, int, int, range, range]]:
        """Return (scale, width, height, x positions, y positions) per usable scale.

        Scales whose window would be empty or larger than the target are skipped.
        """
        template_w, template_h = template_size
        target_w, target_h = target_size

        plans = []
        for scale in scales:
            scaled_w = int(template_w * scale)
            scaled_h = int(template_h * scale)

            # A window truncated to zero pixels cannot be cropped and hashed
            if scaled_w < 1 or scaled_h < 1:
                continue
            if scaled_w > target_w or scaled_h > target_h:
                continue

            # Step is a fraction of the window size, but at least one pixel
            step_x = max(1, int(scaled_w * step_factor))
            step_y = max(1, int(scaled_h * step_factor))

            x_positions = range(0, target_w - scaled_w + 1, step_x)
            y_positions = range(0, target_h - scaled_h + 1, step_y)
            plans.append((scale, scaled_w, scaled_h, x_positions, y_positions))

        return plans

    def find_matches_in_image(
        self,
        template: "Image.Image",
        target: "Image.Image",
        template_hash: Optional[int] = None,
        confidence_threshold: float = 0.6,
        step_factor: float = 0.1,
        sizes: Optional[Union[float, Tuple, List]] = None,
        show_progress: bool = True,
        progress_callback: Optional[Callable[[], None]] = None,
    ) -> "List[MatchCandidate]":
        """
        Find all matches of template in target image using sliding window.

        Args:
            template: Template image to search for
            target: Target image to search in
            template_hash: Pre-computed hash of template (optional)
            confidence_threshold: Minimum similarity score (0-1)
            step_factor: Step size as fraction of template size
            sizes: Size variations to search. Can be:
                - float: ±percentage (e.g., 0.2 = 80%-120%)
                - tuple(min, max): search range with smart logarithmic steps
                - tuple(min, max, step): explicit step size
                - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
            show_progress: Show progress bar for sliding window search
            progress_callback: Optional callback function to call for each window checked.
                When given, it replaces the built-in progress bar.

        Returns:
            List of MatchCandidate objects, highest confidence first, with
            overlapping candidates removed

        Raises:
            ValueError: If ``sizes`` is not in a supported format.
        """
        matches = []

        # Compute template hash if not provided
        if template_hash is None:
            template_hash = compute_phash(template, self.hash_size)

        template_w, template_h = template.size

        # Work out the window geometry once; it drives both the progress
        # total and the search itself
        scales = self._get_search_scales(sizes)
        plans = self._plan_windows(template.size, target.size, scales, step_factor)

        # Setup progress bar if needed (only if no callback provided)
        progress_bar = None
        if show_progress and not progress_callback:
            total_iterations = sum(len(xs) * len(ys) for _, _, _, xs, ys in plans)
            if total_iterations > 0:
                progress_bar = tqdm(
                    total=total_iterations, desc="Scanning", unit="window", leave=False
                )

        try:
            for scale, scaled_w, scaled_h, x_positions, y_positions in plans:
                needs_resize = scale != 1.0

                for y in y_positions:
                    for x in x_positions:
                        # Window rectangle in target image coordinates
                        bbox = (x, y, x + scaled_w, y + scaled_h)
                        window = target.crop(bbox)

                        # Bring the window back to template size before hashing
                        if needs_resize:
                            window = window.resize(
                                (template_w, template_h), Image.Resampling.LANCZOS
                            )

                        window_hash = compute_phash(window, self.hash_size)
                        similarity = hash_similarity(template_hash, window_hash, self.hash_bits)

                        if similarity >= confidence_threshold:
                            matches.append(MatchCandidate(bbox, window_hash, similarity))

                        # Report one window checked
                        if progress_bar is not None:
                            progress_bar.update(1)
                        elif progress_callback:
                            progress_callback()
        finally:
            # Close the bar even when cropping or hashing raises
            if progress_bar is not None:
                progress_bar.close()

        # Remove overlapping matches (keep highest confidence)
        return self._filter_overlapping_matches(matches)

    def _filter_overlapping_matches(
        self, matches: "List[MatchCandidate]", overlap_threshold: float = 0.5
    ) -> "List[MatchCandidate]":
        """Remove overlapping matches, keeping the highest confidence ones.

        Candidates are visited from highest to lowest confidence; one is kept
        only if its IoU with every already-kept candidate is at most
        ``overlap_threshold``.
        """
        if not matches:
            return matches

        # Sort by confidence (highest first); the sort is stable, so ties keep
        # their discovery order
        ranked = sorted(matches, key=lambda m: m.confidence, reverse=True)
        filtered = []

        for candidate in ranked:
            overlaps_kept = any(
                self._calculate_overlap(candidate.bbox, kept.bbox) > overlap_threshold
                for kept in filtered
            )
            if not overlaps_kept:
                filtered.append(candidate)

        return filtered

    def _calculate_overlap(self, bbox1: Tuple, bbox2: Tuple) -> float:
        """Calculate intersection over union (IoU) for two bboxes.

        Both boxes are (x_min, y_min, x_max, y_max). Returns 0.0 when the boxes
        do not intersect or when the union has no area.
        """
        x1_min, y1_min, x1_max, y1_max = bbox1
        x2_min, y2_min, x2_max, y2_max = bbox2

        # Calculate intersection
        intersect_xmin = max(x1_min, x2_min)
        intersect_ymin = max(y1_min, y2_min)
        intersect_xmax = min(x1_max, x2_max)
        intersect_ymax = min(y1_max, y2_max)

        if intersect_xmax < intersect_xmin or intersect_ymax < intersect_ymin:
            return 0.0

        intersect_area = (intersect_xmax - intersect_xmin) * (intersect_ymax - intersect_ymin)

        # Calculate union
        area1 = (x1_max - x1_min) * (y1_max - y1_min)
        area2 = (x2_max - x2_min) * (y2_max - y2_min)
        union_area = area1 + area2 - intersect_area

        return intersect_area / union_area if union_area > 0 else 0.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -23,28 +23,28 @@ natural_pdf/analyzers/layout/yolo.py,sha256=2Iz2-WsMy--ftkZQ8j5PGqp_1fTD7Mskl2kN
23
23
  natural_pdf/classification/manager.py,sha256=BaqBL9GeMvYgoJsiQeI2J8aUKQ5Qxu_ELRvmCWquld8,22172
24
24
  natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGvLokS2w,9416
25
25
  natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
26
- natural_pdf/collections/mixins.py,sha256=u4KtnlUZZYQ74e0OXAniOv9RtuA6FhwBxsLMJLjdbpQ,5169
26
+ natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
27
27
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
28
  natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
29
- natural_pdf/core/highlighting_service.py,sha256=mhHEomIlHQM1lVmEcUMJ4xtHsvCRb3rMroW1d1Gqs-M,67942
30
- natural_pdf/core/page.py,sha256=M_V0IXahopIN45ENmvIm4m_WdnrjYECmXe7xhUQtjQI,142455
31
- natural_pdf/core/page_collection.py,sha256=coVUsp4uLR2GImLbuGFpBIYcU952eJLfBQNgTmkOSzU,52486
29
+ natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
30
+ natural_pdf/core/page.py,sha256=XrDePXZgXgB3w8hvxh4-EhPQnrwmw-0z-I_K24__OtY,142550
31
+ natural_pdf/core/page_collection.py,sha256=hEeXs_fzB73XZ8ZkHz2kIuSgBYcVYydvGMMdGuB1rvw,52486
32
32
  natural_pdf/core/page_groupby.py,sha256=550ME6kd-h-2u75oUIIIqTYsmh8VvdQO1nXXioL8J6A,7378
33
- natural_pdf/core/pdf.py,sha256=q54DyhXwAS_zAmsBd3PsCezu1wyQOYmGmB3iKfP8gAM,101884
34
- natural_pdf/core/pdf_collection.py,sha256=8tM0qVWS1L5Hwv5cXuZ2X8znAYOjKmlERX62bksDlJU,30144
35
- natural_pdf/core/render_spec.py,sha256=SgT6bHR3yduZ_-JhFWRFmakUD74NPQJ8q1lY6iB3prQ,12916
33
+ natural_pdf/core/pdf.py,sha256=Loe6sbQzBp9VDeIAuDS3zQmeDWvQMj5SWIQMky5bPDA,101964
34
+ natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
35
+ natural_pdf/core/render_spec.py,sha256=rLicaS9EPyojpJcjy2Lzn5DLWQwjrFyDJyRo7jbjdGU,14505
36
36
  natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
37
37
  natural_pdf/describe/base.py,sha256=Of9WVo9XuShXoeyJr0RN2CpLhF_CeiOjazl-or53RKU,18173
38
38
  natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
39
39
  natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
40
40
  natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
41
41
  natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
42
- natural_pdf/elements/base.py,sha256=WUwYDzeeGNkr26lWKm8PqGlW9WQIPoteWYIpvlcxTrs,53939
43
- natural_pdf/elements/element_collection.py,sha256=uQoZ2GFCnru0LCiv5zr6wIu2IWgM0j2m44qjsJPNPbk,101340
42
+ natural_pdf/elements/base.py,sha256=aj-eXOQQlhKv9lYeUlUs9aKNcUebtG_dqxURZHZVZ58,55509
43
+ natural_pdf/elements/element_collection.py,sha256=slCUnOT04sNOTjSGgmhjcCKKPVPtdDPwU7PX1ebzGMw,101342
44
44
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
45
45
  natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
46
46
  natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
47
- natural_pdf/elements/region.py,sha256=ClL2vxx2aVoAecaAlkUDZ2ygvUiP8oTa-xfIclm2Eg8,155286
47
+ natural_pdf/elements/region.py,sha256=RxWidI7oNrdbuuj94SfdFXmcSDTfy89uGCeVMQvAfks,155591
48
48
  natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
49
49
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
50
50
  natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -100,9 +100,13 @@ natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-
100
100
  natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
101
101
  natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
102
102
  natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
103
+ natural_pdf/vision/__init__.py,sha256=RymMY-3WLQBlOZ4Dx4MmL9UH6I65hNjkwUJ7ymO5JfM,287
104
+ natural_pdf/vision/mixin.py,sha256=OJwBABr74TWxP5seTKUmGj5zE9mWsBP_UKWU-Pr8V9A,8720
105
+ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs,5119
106
+ natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
103
107
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
104
108
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
105
- natural_pdf-0.2.2.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
109
+ natural_pdf-0.2.4.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
106
110
  optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
107
111
  optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
108
112
  optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
@@ -119,8 +123,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
119
123
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
120
124
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
121
125
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
122
- natural_pdf-0.2.2.dist-info/METADATA,sha256=uLGyhgV-iSjcvpvaj9s8ArQUzg1UTAF6bPXTf4BuZSE,6959
123
- natural_pdf-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
124
- natural_pdf-0.2.2.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
125
- natural_pdf-0.2.2.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
126
- natural_pdf-0.2.2.dist-info/RECORD,,
126
+ natural_pdf-0.2.4.dist-info/METADATA,sha256=G1tmes61GVEt6zLeDISuJZgceLQywIU-uRspGA_90Q8,6959
127
+ natural_pdf-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
128
+ natural_pdf-0.2.4.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
129
+ natural_pdf-0.2.4.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
130
+ natural_pdf-0.2.4.dist-info/RECORD,,