natural-pdf 0.2.12__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,8 @@ import numpy as np
 from PIL import Image
 from tqdm.auto import tqdm
 
+from .template_matching import TemplateMatcher
+
 
 @dataclass
 class MatchCandidate:
@@ -17,7 +19,12 @@ class MatchCandidate:
     confidence: float
 
 
-def compute_phash(image: Image.Image, hash_size: int = 8, blur_radius: float = 0) -> int:
+def compute_phash(
+    image: Image.Image,
+    hash_size: int = 8,
+    blur_radius: float = 0,
+    mask_threshold: Optional[float] = None,
+) -> int:
     """
     Compute perceptual hash of an image using DCT.
 
@@ -25,6 +32,8 @@ def compute_phash(image: Image.Image, hash_size: int = 8, blur_radius: float = 0
         image: PIL Image to hash
         hash_size: Size of the hash (8 = 64-bit hash)
         blur_radius: Optional blur to apply before hashing (makes matching more tolerant)
+        mask_threshold: If provided, pixels >= this value (0-255 scale) are treated as
+            background and normalized out before hashing. Useful for ignoring white backgrounds.
 
     Returns:
         Integer hash value
@@ -39,6 +48,25 @@ def compute_phash(image: Image.Image, hash_size: int = 8, blur_radius: float = 0
 
         image = image.filter(ImageFilter.GaussianBlur(radius=blur_radius))
 
+    # Apply masking if threshold provided
+    if mask_threshold is not None:
+        # For phash, masking works by normalizing the background so the hash
+        # reflects relative differences rather than absolute pixel values
+        img_array = np.array(image, dtype=np.float32)
+
+        # Normalize by subtracting a representative background value,
+        # taken from the bright (background) pixels
+        bright_pixels = img_array[img_array >= mask_threshold]
+        if len(bright_pixels) > 0:
+            # Use the median of the bright pixels as the background
+            background_val = np.median(bright_pixels)
+            # Re-center the image so the background sits at mid-gray;
+            # different backgrounds then produce similar hashes
+            img_array = np.clip(img_array - background_val + 128, 0, 255)
+
+        # Convert back to PIL Image
+        image = Image.fromarray(img_array.astype(np.uint8))
+
     # Resize to 32x32 (4x the hash size for DCT)
     highfreq_factor = 4
     img_size = hash_size * highfreq_factor
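A minimal sketch of what the new mask_threshold parameter buys (the images and threshold below are invented for illustration): the same dark mark on two different light backgrounds should hash nearly identically once each background is re-centered to mid-gray.

    from PIL import Image, ImageDraw

    # Same dark square on two different light backgrounds (hypothetical values)
    img_a = Image.new("L", (64, 64), 250)   # near-white background
    img_b = Image.new("L", (64, 64), 210)   # light-gray background
    for img in (img_a, img_b):
        ImageDraw.Draw(img).rectangle([16, 16, 48, 48], fill=30)

    # With mask_threshold=180 (0-255 scale), both backgrounds collapse to
    # mid-gray before hashing, so the two 64-bit hashes should agree closely
    h_a = compute_phash(img_a, mask_threshold=180)
    h_b = compute_phash(img_b, mask_threshold=180)
    print(hash_similarity(h_a, h_b))   # expected to be near 1.0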
@@ -80,12 +108,13 @@ def hash_similarity(hash1: int, hash2: int, hash_size: int = 64) -> float:
 
 
 class VisualMatcher:
-    """Handles visual similarity matching using perceptual hashing"""
+    """Handles visual similarity matching using perceptual hashing or template matching"""
 
     def __init__(self, hash_size: int = 12):
         self.hash_size = hash_size
         self.hash_bits = hash_size * hash_size
         self._cache = {}
+        self.template_matcher = TemplateMatcher()  # Default: "zncc"
 
     def _get_search_scales(self, sizes: Optional[Union[float, Tuple, List]]) -> List[float]:
         """
@@ -172,20 +201,22 @@
         target: Image.Image,
         template_hash: Optional[int] = None,
         confidence_threshold: float = 0.6,
-        step_factor: float = 0.1,
+        step: Optional[int] = None,
         sizes: Optional[Union[float, Tuple, List]] = None,
         show_progress: bool = True,
         progress_callback: Optional[Callable[[], None]] = None,
+        method: str = "phash",
+        mask_threshold: Optional[float] = None,
     ) -> List[MatchCandidate]:
         """
-        Find all matches of template in target image using sliding window.
+        Find all matches of template in target image.
 
         Args:
             template: Template image to search for
             target: Target image to search in
-            template_hash: Pre-computed hash of template (optional)
+            template_hash: Pre-computed hash of template (optional, only for phash)
             confidence_threshold: Minimum similarity score (0-1)
-            step_factor: Step size as fraction of template size
+            step: Step size in pixels for sliding window
             sizes: Size variations to search. Can be:
                 - float: ±percentage (e.g., 0.2 = 80%-120%)
                 - tuple(min, max): search range with smart logarithmic steps
@@ -193,15 +224,153 @@
                 - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
             show_progress: Show progress bar for sliding window search
             progress_callback: Optional callback function to call for each window checked
+            method: "phash" (default) or "template" for template matching
+            mask_threshold: Pixels >= this value (0-1 scale) are treated as background.
+                - For template matching: masked pixels are ignored in the correlation
+                - For phash: the background is normalized away before hashing
+                Useful for logos/text on varying backgrounds (e.g., 0.95)
 
         Returns:
             List of MatchCandidate objects
         """
+        if method == "template":
+            # Use template matching
+            return self._template_match(
+                template,
+                target,
+                confidence_threshold,
+                step,
+                sizes,
+                show_progress,
+                progress_callback,
+                mask_threshold,
+            )
+        else:
+            # Use the existing perceptual hash matching
+            return self._phash_match(
+                template,
+                target,
+                template_hash,
+                confidence_threshold,
+                step,
+                sizes,
+                show_progress,
+                progress_callback,
+                mask_threshold,
+            )
+
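A usage sketch of the new dispatch (hedged: the enclosing method's name is not visible in this hunk, so find_matches_in_image, the file paths, and the parameter values are assumptions for illustration):

    from PIL import Image

    matcher = VisualMatcher()
    template = Image.open("logo.png")   # placeholder paths
    target = Image.open("page.png")

    # Pixel-level template matching, ignoring near-white template pixels
    hits = matcher.find_matches_in_image(
        template, target,
        confidence_threshold=0.8,
        method="template",
        mask_threshold=0.95,
    )

    # Scale-tolerant perceptual-hash search with a 4 px stride
    hits = matcher.find_matches_in_image(
        template, target,
        step=4,
        sizes=(0.8, 1.2),
        method="phash",
    )
    for m in hits:
        print(m.bbox, round(m.confidence, 3))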
+    def _template_match(
+        self, template, target, threshold, step, sizes, show_progress, callback, mask_threshold
+    ):
+        """Template matching implementation"""
+        matches = []
+
+        template_w, template_h = template.size
+        target_w, target_h = target.size
+
+        # Convert to grayscale numpy arrays
+        target_gray = np.array(target.convert("L"), dtype=np.float32) / 255.0
+
+        # Determine scales to search
+        scales = self._get_search_scales(sizes)
+
+        # Default step size if not provided
+        if step is None:
+            step = 1
+
+        # Calculate total operations for progress bar
+        total_operations = 0
+        if show_progress and not callback:
+            for scale in scales:
+                scaled_w = int(template_w * scale)
+                scaled_h = int(template_h * scale)
+
+                if scaled_w <= target_w and scaled_h <= target_h:
+                    # Compute score map size
+                    out_h = (target_h - scaled_h) // step + 1
+                    out_w = (target_w - scaled_w) // step + 1
+                    total_operations += out_h * out_w
+
+        # Setup progress bar
+        progress_bar = None
+        if show_progress and not callback and total_operations > 0:
+            progress_bar = tqdm(
+                total=total_operations, desc="Template matching", unit="position", leave=False
+            )
+
+        # Search at each scale
+        for scale in scales:
+            # Resize template
+            scaled_w = int(template_w * scale)
+            scaled_h = int(template_h * scale)
+
+            if scaled_w > target_w or scaled_h > target_h:
+                continue
+
+            scaled_template = template.resize((scaled_w, scaled_h), Image.Resampling.LANCZOS)
+            template_gray = np.array(scaled_template.convert("L"), dtype=np.float32) / 255.0
+
+            # Run template matching
+            scores = self.template_matcher.match_template(
+                target_gray, template_gray, step, mask_threshold
+            )
+
+            # Find peaks above threshold
+            y_indices, x_indices = np.where(scores >= threshold)
+
+            # Update progress
+            if progress_bar:
+                progress_bar.update(scores.size)
+            elif callback:
+                for _ in range(scores.size):
+                    callback()
+
+            for i in range(len(y_indices)):
+                y_idx = y_indices[i]
+                x_idx = x_indices[i]
+                score = scores[y_idx, x_idx]
+
+                # Convert back to image coordinates
+                x = x_idx * step
+                y = y_idx * step
+
+                matches.append(
+                    MatchCandidate(
+                        bbox=(x, y, x + scaled_w, y + scaled_h),
+                        hash_value=0,  # Not used for template matching
+                        confidence=float(score),
+                    )
+                )
+
+        # Close progress bar
+        if progress_bar:
+            progress_bar.close()
+
+        # Remove overlapping matches
+        return self._filter_overlapping_matches(matches)
+
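_filter_overlapping_matches is called above but defined outside this diff; as an assumption about its behavior, a greedy IoU-based non-maximum suppression along these lines would do the job (the function name and threshold here are hypothetical):

    def filter_overlapping_matches_sketch(matches, iou_threshold=0.3):
        """Keep the highest-confidence match from each overlapping cluster."""
        def iou(a, b):
            ax0, ay0, ax1, ay1 = a
            bx0, by0, bx1, by1 = b
            ix0, iy0 = max(ax0, bx0), max(ay0, by0)
            ix1, iy1 = min(ax1, bx1), min(ay1, by1)
            inter = max(0, ix1 - ix0) * max(0, iy1 - iy0)
            union = (ax1 - ax0) * (ay1 - ay0) + (bx1 - bx0) * (by1 - by0) - inter
            return inter / union if union else 0.0

        kept = []
        # Visit candidates from most to least confident; keep those that do
        # not overlap an already-kept match beyond the IoU threshold
        for m in sorted(matches, key=lambda m: m.confidence, reverse=True):
            if all(iou(m.bbox, k.bbox) < iou_threshold for k in kept):
                kept.append(m)
        return kept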
+    def _phash_match(
+        self,
+        template,
+        target,
+        template_hash,
+        threshold,
+        step,
+        sizes,
+        show_progress,
+        callback,
+        mask_threshold=None,
+    ):
+        """Original perceptual hash matching"""
         matches = []
 
         # Compute template hash if not provided
         if template_hash is None:
-            template_hash = compute_phash(template, self.hash_size)
+            # Convert mask threshold from 0-1 to 0-255 for PIL Image
+            mask_threshold_255 = int(mask_threshold * 255) if mask_threshold is not None else None
+            template_hash = compute_phash(
+                template, self.hash_size, mask_threshold=mask_threshold_255
+            )
 
         template_w, template_h = template.size
         target_w, target_h = target.size
@@ -209,22 +378,24 @@
         # Determine scales to search
         scales = self._get_search_scales(sizes)
 
+        # Default step size if not provided (10% of template size)
+        if step is None:
+            step = max(1, int(min(template_w, template_h) * 0.1))
+
         # Calculate total iterations for progress bar
         total_iterations = 0
-        if show_progress and not progress_callback:
+        if show_progress and not callback:
             for scale in scales:
                 scaled_w = int(template_w * scale)
                 scaled_h = int(template_h * scale)
                 if scaled_w <= target_w and scaled_h <= target_h:
-                    step_x = max(1, int(scaled_w * step_factor))
-                    step_y = max(1, int(scaled_h * step_factor))
-                    x_steps = len(range(0, target_w - scaled_w + 1, step_x))
-                    y_steps = len(range(0, target_h - scaled_h + 1, step_y))
+                    x_steps = len(range(0, target_w - scaled_w + 1, step))
+                    y_steps = len(range(0, target_h - scaled_h + 1, step))
                     total_iterations += x_steps * y_steps
 
         # Setup progress bar if needed (only if no callback provided)
         progress_bar = None
-        if show_progress and not progress_callback and total_iterations > 0:
+        if show_progress and not callback and total_iterations > 0:
             progress_bar = tqdm(total=total_iterations, desc="Scanning", unit="window", leave=False)
 
         # Search at each scale
@@ -236,13 +407,9 @@
             if scaled_w > target_w or scaled_h > target_h:
                 continue
 
-            # Calculate step size
-            step_x = max(1, int(scaled_w * step_factor))
-            step_y = max(1, int(scaled_h * step_factor))
-
             # Sliding window search
-            for y in range(0, target_h - scaled_h + 1, step_y):
-                for x in range(0, target_w - scaled_w + 1, step_x):
+            for y in range(0, target_h - scaled_h + 1, step):
+                for x in range(0, target_w - scaled_w + 1, step):
                     # Extract window
                     window = target.crop((x, y, x + scaled_w, y + scaled_h))
 
@@ -251,10 +418,15 @@
                         window = window.resize((template_w, template_h), Image.Resampling.LANCZOS)
 
                     # Compute hash and similarity
-                    window_hash = compute_phash(window, self.hash_size)
+                    mask_threshold_255 = (
+                        int(mask_threshold * 255) if mask_threshold is not None else None
+                    )
+                    window_hash = compute_phash(
+                        window, self.hash_size, mask_threshold=mask_threshold_255
+                    )
                     similarity = hash_similarity(template_hash, window_hash, self.hash_bits)
 
-                    if similarity >= confidence_threshold:
+                    if similarity >= threshold:
                         # Convert back to target image coordinates
                         bbox = (x, y, x + scaled_w, y + scaled_h)
                         matches.append(MatchCandidate(bbox, window_hash, similarity))
@@ -262,8 +434,8 @@
                     # Update progress
                     if progress_bar:
                         progress_bar.update(1)
-                    elif progress_callback:
-                        progress_callback()
+                    elif callback:
+                        callback()
 
         # Close progress bar
         if progress_bar:
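Note the semantics change across this file: step_factor (a per-axis fraction of the scaled template) is replaced by a single absolute pixel stride, step, and the two methods default differently when step is None. Worked numbers for a 200×100 template at scale 1.0, taken directly from the expressions above:

    # 0.2.12: step_factor=0.1 -> step_x = max(1, int(200 * 0.1)) = 20 px,
    #                            step_y = max(1, int(100 * 0.1)) = 10 px
    # 0.2.13: step=None -> phash:    max(1, int(min(200, 100) * 0.1)) = 10 px (both axes)
    #                      template: step = 1 px (exhaustive, pixel-accurate)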
@@ -0,0 +1,209 @@
+"""Pure NumPy template matching implementation"""
+
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+
+import numpy as np
+
+
+@dataclass
+class TemplateMatch:
+    """Result of template matching"""
+
+    bbox: Tuple[int, int, int, int]  # x0, y0, x1, y1
+    score: float  # 0-1, higher is better
+
+
+class TemplateMatcher:
+    """Pure NumPy template matching implementation"""
+
+    def __init__(self, method: str = "zncc"):
+        """
+        Args:
+            method: Matching method
+                - "zncc": Zero-mean Normalized Cross-Correlation (default, recommended)
+                - "ncc": Normalized Cross-Correlation
+                - "ssd": Sum of Squared Differences
+        """
+        self.method = method
+
+    def match_template(
+        self,
+        image: np.ndarray,
+        template: np.ndarray,
+        step: int = 1,
+        mask_threshold: Optional[float] = None,
+    ) -> np.ndarray:
+        """
+        Compute similarity map between image and template.
+
+        Args:
+            image: Target image (grayscale, normalized 0-1)
+            template: Template to search for (grayscale, normalized 0-1)
+            step: Step size for sliding window (1 = pixel perfect, >1 = faster)
+            mask_threshold: If provided, pixels >= this value in template are masked (ignored).
+                Useful for ignoring white backgrounds (e.g., 0.95 for near-white)
+
+        Returns:
+            2D array of match scores
+        """
+        if self.method == "zncc":
+            return self._zncc(image, template, step, mask_threshold)
+        elif self.method == "ncc":
+            return self._ncc(image, template, step, mask_threshold)
+        elif self.method == "ssd":
+            return self._ssd(image, template, step, mask_threshold)
+        else:
+            # Default to zncc
+            return self._zncc(image, template, step, mask_threshold)
+
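For reference, the ZNCC score implemented below is the cosine similarity of the mean-centered window and template, restricted to the unmasked pixel set M (this restates the code, not a separate definition):

    \mathrm{ZNCC}(W, T) =
      \frac{\sum_{p \in M} (W_p - \mu_W)(T_p - \mu_T)}
           {\sqrt{\sum_{p \in M} (W_p - \mu_W)^2}\;\sqrt{\sum_{p \in M} (T_p - \mu_T)^2}}
      \in [-1, 1]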
+    def _zncc(
+        self,
+        image: np.ndarray,
+        template: np.ndarray,
+        step: int = 1,
+        mask_threshold: Optional[float] = None,
+    ) -> np.ndarray:
+        """Zero-mean Normalized Cross-Correlation - most robust"""
+        h, w = template.shape
+        img_h, img_w = image.shape
+
+        out_h = (img_h - h) // step + 1
+        out_w = (img_w - w) // step + 1
+        result = np.zeros((out_h, out_w))
+
+        # Create mask if threshold provided
+        if mask_threshold is not None:
+            mask = template < mask_threshold  # True for pixels to keep
+            if np.sum(mask) == 0:
+                # All pixels are masked - return zeros
+                return result
+        else:
+            mask = np.ones_like(template, dtype=bool)
+
+        # Precompute template statistics on non-masked pixels
+        masked_template = template[mask]
+        if len(masked_template) == 0:
+            return result
+
+        template_mean = np.mean(masked_template)
+        template_centered = np.zeros_like(template)
+        template_centered[mask] = template[mask] - template_mean
+        template_std = np.sqrt(np.sum(template_centered[mask] ** 2))
+
+        # Handle uniform template case
+        if template_std == 0:
+            # Template has no variation - fall back to checking if means match
+            for i in range(out_h):
+                for j in range(out_w):
+                    y = i * step
+                    x = j * step
+                    window = image[y : y + h, x : x + w]
+                    window_masked = window[mask]
+                    window_mean = np.mean(window_masked)
+                    window_std = np.std(window_masked)
+
+                    # Perfect match if window also has same mean and no variation
+                    if abs(window_mean - template_mean) < 0.01 and window_std < 0.01:
+                        result[i, j] = 1.0
+            return result
+
+        for i in range(out_h):
+            for j in range(out_w):
+                y = i * step
+                x = j * step
+                window = image[y : y + h, x : x + w]
+
+                # Apply mask to window
+                window_masked = window[mask]
+                window_mean = np.mean(window_masked)
+                window_centered = np.zeros_like(window)
+                window_centered[mask] = window[mask] - window_mean
+                window_std = np.sqrt(np.sum(window_centered[mask] ** 2))
+
+                if window_std > 0:
+                    correlation = np.sum(window_centered[mask] * template_centered[mask])
+                    result[i, j] = correlation / (template_std * window_std)
+
+        return np.clip(result, -1, 1)
+
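The mask convention used by all three methods keeps template pixels below the threshold, so near-white background pixels drop out of every sum; a tiny worked example:

    import numpy as np

    template = np.array([[0.10, 0.98],
                         [0.20, 0.97]], dtype=np.float32)
    mask = template < 0.95
    # mask == [[ True, False],
    #          [ True, False]]  -> the near-white column is ignored
    print(template[mask])        # [0.1 0.2]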
+    def _ncc(
+        self,
+        image: np.ndarray,
+        template: np.ndarray,
+        step: int = 1,
+        mask_threshold: Optional[float] = None,
+    ) -> np.ndarray:
+        """Normalized Cross-Correlation"""
+        h, w = template.shape
+        img_h, img_w = image.shape
+
+        out_h = (img_h - h) // step + 1
+        out_w = (img_w - w) // step + 1
+        result = np.zeros((out_h, out_w))
+
+        # Create mask if threshold provided
+        if mask_threshold is not None:
+            mask = template < mask_threshold  # True for pixels to keep
+            if np.sum(mask) == 0:
+                return result
+        else:
+            mask = np.ones_like(template, dtype=bool)
+
+        template_norm = np.sqrt(np.sum(template[mask] ** 2))
+        if template_norm == 0:
+            return result
+
+        for i in range(out_h):
+            for j in range(out_w):
+                y = i * step
+                x = j * step
+                window = image[y : y + h, x : x + w]
+
+                window_norm = np.sqrt(np.sum(window[mask] ** 2))
+                if window_norm > 0:
+                    correlation = np.sum(window[mask] * template[mask])
+                    result[i, j] = correlation / (template_norm * window_norm)
+
+        return result
+
+    def _ssd(
+        self,
+        image: np.ndarray,
+        template: np.ndarray,
+        step: int = 1,
+        mask_threshold: Optional[float] = None,
+    ) -> np.ndarray:
+        """Sum of Squared Differences - converted to similarity score"""
+        h, w = template.shape
+        img_h, img_w = image.shape
+
+        out_h = (img_h - h) // step + 1
+        out_w = (img_w - w) // step + 1
+        result = np.zeros((out_h, out_w))
+
+        # Create mask if threshold provided
+        if mask_threshold is not None:
+            mask = template < mask_threshold  # True for pixels to keep
+            if np.sum(mask) == 0:
+                return result
+        else:
+            mask = np.ones_like(template, dtype=bool)
+
+        # Number of valid pixels for normalization
+        n_valid = np.sum(mask)
+        if n_valid == 0:
+            return result
+
+        for i in range(out_h):
+            for j in range(out_w):
+                y = i * step
+                x = j * step
+                window = image[y : y + h, x : x + w]
+
+                # Only compute SSD on non-masked pixels
+                diff = window - template
+                ssd = np.sum((diff[mask]) ** 2) / n_valid
+                result[i, j] = 1.0 / (1.0 + ssd)  # Convert to similarity
+
+        return result
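A self-contained smoke test for the class above (the synthetic arrays are invented for the example; note that _ssd maps per-pixel error to a similarity via 1 / (1 + ssd), so all three methods score "higher is better"):

    import numpy as np

    matcher = TemplateMatcher(method="zncc")

    rng = np.random.default_rng(0)
    image = rng.random((100, 100)).astype(np.float32)
    template = image[20:40, 30:60]   # a patch cut straight out of the image

    scores = matcher.match_template(image, template, step=1)
    y, x = np.unravel_index(np.argmax(scores), scores.shape)
    print(y, x, float(scores[y, x]))  # expect (20, 30) with a score near 1.0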
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.2.12
+Version: 0.2.13
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -26,7 +26,7 @@ natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZD
 natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
 natural_pdf/core/element_manager.py,sha256=KPuKM7SstfErTkRnGq4vrgE0Tv8iazN13Jp7yAXGKso,55575
-natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
+natural_pdf/core/highlighting_service.py,sha256=wEV-koqHoHf7S3wZ3j8D2L-ucGp3Nd0YhhStz9yqeLc,70406
 natural_pdf/core/page.py,sha256=Pid5hqVjcyX-gcCzxCJ62k6AQhNbUMNM_5QmEcylIjM,155264
 natural_pdf/core/page_collection.py,sha256=IjdFq9q0D0P6ZKWInf0H25rLzxfMb7RsUXucogkhNkU,63169
 natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
@@ -39,12 +39,12 @@ natural_pdf/describe/elements.py,sha256=3Y541z5TQ2obrfZFiFi1YQMsCt3oYrhMHpD5j1tu
 natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
 natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
 natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
-natural_pdf/elements/base.py,sha256=92ukTtRCQFsa5KvKflChCt4mt0ZGS4ecGYCQTNMO4zU,58907
+natural_pdf/elements/base.py,sha256=DozTl9IS3DtSqBNArUEtHeuIiDcNWUW_gFKoUebmC4M,59573
 natural_pdf/elements/element_collection.py,sha256=idM_BUWEfbCJ5Sq0Ae_KfbVHy8TdkNfzs7iWkFe_j2I,130707
 natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
 natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
 natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
-natural_pdf/elements/region.py,sha256=HF6KzeuudO9upVLIrPsp3omcziLcILE3nnzl1a-LvK0,165400
+natural_pdf/elements/region.py,sha256=Lf2wZgZn-C7g__eK6adgkKPjFoWbjj6A6GLnz0pn5_w,166733
 natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
 natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -101,38 +101,39 @@ natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-
 natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
 natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
 natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
-natural_pdf/vision/__init__.py,sha256=RymMY-3WLQBlOZ4Dx4MmL9UH6I65hNjkwUJ7ymO5JfM,287
-natural_pdf/vision/mixin.py,sha256=OJwBABr74TWxP5seTKUmGj5zE9mWsBP_UKWU-Pr8V9A,8720
-natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs,5119
-natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
+natural_pdf/vision/__init__.py,sha256=TkoQtdODlh0n_99dsjLIWKE9dgK0m4jfrui_cQ3gTwU,221
+natural_pdf/vision/mixin.py,sha256=wlsX42cFUnUepZHsEfKBqXiDEPUwBG6-KN2Cx5qz_lw,10812
+natural_pdf/vision/results.py,sha256=_NBRCKtDd1M3sWK7zHSym7-jpQqW4kR_iFFL4PvnBNo,6649
+natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2ZzU,17925
+natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
-natural_pdf-0.2.12.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.2.13.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
 optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
 optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
 optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
 optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
 optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
-temp/debug_cell_extraction.py,sha256=nE0Z470P40v8xZfWO1V3qgNaejs_pernEQaUOFeOJ1U,1527
-temp/debug_exclusion_overlap.py,sha256=RptJXwqBXy5gsvMF037KEx1o2QgjwEDkMB6TD5aJdqA,1644
-temp/debug_exclusions_guides.py,sha256=s8siep9te1KRJ2j0vH1tvDQnBlz7PKbHeCiYMrZL8jE,2096
-temp/debug_extra_guide.py,sha256=95Tim-YnmAR4kICw2XDKVDvlW5WsjK_51cv5-EV11rc,1236
-temp/debug_outer_boundaries.py,sha256=uJUJwojTxOU4VtbGUouuhV65IYzS6NDIVKxnS7o64nU,1456
-temp/debug_st_search.py,sha256=F4c_mUVi_d5AKaKIpQ0AnW1amDqAwALoQQj7wZj--J0,1021
 temp/fix_page_exclusions.py,sha256=YIj62zF38TdoBARAuSIvEbetl_JfXG-mp4v9p355qmo,1358
+temp/test_draw_guides.py,sha256=_eSSBElGHQkd2QD_KA_Okw70v0dlY5m-1-C5SQwKAJw,642
+temp/test_draw_guides_interactive.py,sha256=FsH-2ZQGsGx_8QfVCWUAkLbOcJz-VfiwROzQD4AD7kQ,926
 temp/test_exclusion_with_debug.py,sha256=CScxHvb43KrB5dzXuTOhuzjcBXZBdfYB5ygiKkEW26g,1393
 temp/test_find_exclusions_fix.py,sha256=1l5aEqnElcl3kiykdtmJFlVxQ1xMKGm1UckGYEQg--c,2103
 temp/test_find_exclusions_fix_no_recursion.py,sha256=qZspTBwxunRM93N_-fZ2fR5Lodj0ArQX3h10HlTXhfc,3592
 temp/test_fix_real_pdf.py,sha256=uuylxmpeAEbIix9wjl0Gri1sZlN61dBWTq6ZCyfvzF8,1454
 temp/test_fix_working.py,sha256=-Ryre1rXYA2EG_lmPZGYEGi8yz0slhHEXPJMYexZW84,1750
 temp/test_fixed_pdf_exclusions.py,sha256=Q5zxooKDvtTXo-dDsx3nsQw1ZVHX3TW47iZ_dXpFdrY,2168
+temp/test_guide_draw_notebook.py,sha256=9yYRV5mfmVHiL1lnwNj-vksw45d1oWbAZpDGA7yZf-M,1583
 temp/test_horizontal_top_bottom.py,sha256=Mb3tjt9Z3wOTpzFOgK7i0K-j-_ynNh4vDu2x1L3nu-s,2163
+temp/test_inline_js.py,sha256=xuQH8VQn7L4sogv6wd_Rwudx5p_Lt6we1h7U1LPTH-g,646
 temp/test_marker_order.py,sha256=TFZkMxRiNoZGVcdDivYnkIDNvwHaiyKUdYoy2rTTIiI,1417
 temp/test_original_exclusions_now_work.py,sha256=G6LmaF-P9Qhj0j4lT_4ncfCddllfP6L8F_x2prUBr9w,1904
 temp/test_pdf_exclusions_with_guides.py,sha256=QaMl0frgKC8kCPQ2BUI8kqyvqsIjQPXKV_St1rK3zxg,2754
 temp/test_region_exclusions_detailed.py,sha256=EftdW3JY3JH_LX5QlWKt-4drM-joPggK2fKUZRXVTMA,814
 temp/test_stripes_real_pdf.py,sha256=FIvDoJrnuioOMw1A0aTCCfZLeg99lusfe0Fb0MiqnhQ,2618
 temp/test_vertical_stripes.py,sha256=Yf3TJfb_faqAFzlgb7i5u6dDHjF4UMSHIGM99vangRk,1877
+temp/test_widget_functionality.py,sha256=jsEGHYK1dWWa8uEcfGRRj1ReHRMzNoIaMZU4d-o-Djs,2448
+temp/test_widget_simple.py,sha256=Vy_DKgPhPhUQ8nKw_KnhGTpwtmh5EEic0avEyW9hbOQ,1398
 tools/bad_pdf_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tools/bad_pdf_eval/analyser.py,sha256=oqSTo3NLyignp_XdCO9_SRCUUXMU8lfgDavKYZYNxws,13690
 tools/bad_pdf_eval/collate_summaries.py,sha256=L_YsdiqmwGIHYWTVJqo6gyazyn3GIQgpfGGKk8uwckk,5159
@@ -144,8 +145,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
 tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
 tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
 tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
-natural_pdf-0.2.12.dist-info/METADATA,sha256=jRNM0JxYvPDuqzD63earjbaUwQgXCjPYPLC5pLl49Uk,6960
-natural_pdf-0.2.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-natural_pdf-0.2.12.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
-natural_pdf-0.2.12.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
-natural_pdf-0.2.12.dist-info/RECORD,,
+natural_pdf-0.2.13.dist-info/METADATA,sha256=k3WrL3HrPJRbK8Bu5PVIkNlJImAh5N8KC1M_7rZc2WM,6960
+natural_pdf-0.2.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.2.13.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
+natural_pdf-0.2.13.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
+natural_pdf-0.2.13.dist-info/RECORD,,
@@ -0,0 +1,25 @@
+"""Example usage of the interactive guide drawing feature"""
+
+# In a Jupyter notebook:
+from natural_pdf import NaturalPDF
+
+# Load a PDF
+pdf = NaturalPDF.from_file("your_pdf.pdf")
+page = pdf[0]
+
+# Create guides
+guides = page.guides()
+
+# Detect some initial guides (optional)
+guides.vertical.from_lines(n=5)
+guides.horizontal.from_lines(n=5)
+
+# Open interactive editor for vertical guides
+guides.vertical.draw()
+
+# Open interactive editor for horizontal guides
+guides.horizontal.draw(width=600)  # Smaller widget
+
+# After editing, the guides are automatically updated
+# You can now use them to extract tables:
+table = page.extract_table(guides)