natural-pdf 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -78,6 +78,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
78
78
 
79
79
  # # Import new utils
80
80
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
81
+ from natural_pdf.vision.mixin import VisualSearchMixin
81
82
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
82
83
 
83
84
  # --- End Classification Imports --- #
@@ -101,6 +102,7 @@ class Page(
101
102
  ExtractionMixin,
102
103
  ShapeDetectionMixin,
103
104
  DescribeMixin,
105
+ VisualSearchMixin,
104
106
  Visualizable,
105
107
  ):
106
108
  """Enhanced Page wrapper built on top of pdfplumber.Page.
natural_pdf/core/pdf.py CHANGED
@@ -42,6 +42,7 @@ from natural_pdf.ocr import OCRManager, OCROptions
42
42
  from natural_pdf.selectors.parser import parse_selector
43
43
  from natural_pdf.text_mixin import TextMixin
44
44
  from natural_pdf.utils.locks import pdf_render_lock
45
+ from natural_pdf.vision.mixin import VisualSearchMixin
45
46
 
46
47
  if TYPE_CHECKING:
47
48
  from natural_pdf.elements.element_collection import ElementCollection
@@ -252,7 +253,9 @@ class _LazyPageList(Sequence):
252
253
  # --- End Lazy Page List Helper --- #
253
254
 
254
255
 
255
- class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, Visualizable):
256
+ class PDF(
257
+ TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, VisualSearchMixin, Visualizable
258
+ ):
256
259
  """Enhanced PDF wrapper built on top of pdfplumber.
257
260
 
258
261
  This class provides a fluent interface for working with PDF documents,
@@ -40,6 +40,7 @@ logger = logging.getLogger(__name__)
40
40
  from natural_pdf.core.pdf import PDF
41
41
  from natural_pdf.elements.region import Region
42
42
  from natural_pdf.export.mixin import ExportMixin
43
+ from natural_pdf.vision.mixin import VisualSearchMixin
43
44
 
44
45
  # --- Search Imports ---
45
46
  try:
@@ -69,8 +70,8 @@ from natural_pdf.search.searchable_mixin import SearchableMixin # Import the ne
69
70
 
70
71
 
71
72
  class PDFCollection(
72
- SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin
73
- ): # Add ExportMixin and ShapeDetectionMixin
73
+ SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin, VisualSearchMixin
74
+ ):
74
75
  def __init__(
75
76
  self,
76
77
  source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -258,8 +259,6 @@ class PDFCollection(
258
259
  return iter(self._pdfs)
259
260
 
260
261
  def __repr__(self) -> str:
261
- # Removed search status
262
- return f"<PDFCollection(count={len(self._pdfs)})>"
263
262
  return f"<PDFCollection(count={len(self._pdfs)})>"
264
263
 
265
264
  @property
@@ -267,6 +266,134 @@ class PDFCollection(
267
266
  """Returns the list of PDF objects held by the collection."""
268
267
  return self._pdfs
269
268
 
269
+ def show(self, limit: Optional[int] = 30, per_pdf_limit: Optional[int] = 10, **kwargs):
270
+ """
271
+ Display all PDFs in the collection with labels.
272
+
273
+ Each PDF is shown with its pages in a grid layout (6 columns by default),
274
+ and all PDFs are stacked vertically with labels.
275
+
276
+ Args:
277
+ limit: Maximum total pages to show across all PDFs (default: 30)
278
+ per_pdf_limit: Maximum pages to show per PDF (default: 10)
279
+ **kwargs: Additional arguments passed to each PDF's show() method
280
+ (e.g., columns, exclusions, resolution, etc.)
281
+
282
+ Returns:
283
+ Displayed image in Jupyter or None
284
+ """
285
+ if not self._pdfs:
286
+ print("Empty collection")
287
+ return None
288
+
289
+ # Import here to avoid circular imports
290
+ import numpy as np
291
+ from PIL import Image, ImageDraw, ImageFont
292
+
293
+ # Calculate pages per PDF if total limit is set
294
+ if limit and not per_pdf_limit:
295
+ per_pdf_limit = max(1, limit // len(self._pdfs))
296
+
297
+ # Collect images from each PDF
298
+ all_images = []
299
+ total_pages_shown = 0
300
+
301
+ for pdf in self._pdfs:
302
+ if limit and total_pages_shown >= limit:
303
+ break
304
+
305
+ # Calculate limit for this PDF
306
+ pdf_limit = per_pdf_limit
307
+ if limit:
308
+ remaining = limit - total_pages_shown
309
+ pdf_limit = min(per_pdf_limit or remaining, remaining)
310
+
311
+ # Get PDF identifier
312
+ pdf_name = getattr(pdf, "filename", None) or getattr(pdf, "path", "Unknown")
313
+ if isinstance(pdf_name, Path):
314
+ pdf_name = pdf_name.name
315
+ elif "/" in str(pdf_name):
316
+ pdf_name = str(pdf_name).split("/")[-1]
317
+
318
+ # Render this PDF
319
+ try:
320
+ # Get render specs from the PDF
321
+ render_specs = pdf._get_render_specs(mode="show", max_pages=pdf_limit, **kwargs)
322
+
323
+ if not render_specs:
324
+ continue
325
+
326
+ # Get the highlighter and render without displaying
327
+ highlighter = pdf._get_highlighter()
328
+ pdf_image = highlighter.unified_render(
329
+ specs=render_specs,
330
+ layout="grid" if len(render_specs) > 1 else "single",
331
+ columns=6,
332
+ **kwargs,
333
+ )
334
+
335
+ if pdf_image:
336
+ # Add label above the PDF image
337
+ label_height = 40
338
+ label_bg_color = (240, 240, 240)
339
+ label_text_color = (0, 0, 0)
340
+
341
+ # Create new image with space for label
342
+ width, height = pdf_image.size
343
+ labeled_image = Image.new("RGB", (width, height + label_height), "white")
344
+
345
+ # Draw label background
346
+ draw = ImageDraw.Draw(labeled_image)
347
+ draw.rectangle([0, 0, width, label_height], fill=label_bg_color)
348
+
349
+ # Draw label text
350
+ try:
351
+ # Try to use a nice font if available
352
+ font = ImageFont.truetype("Arial", 20)
353
+ except:
354
+ # Fallback to default font
355
+ font = ImageFont.load_default()
356
+
357
+ label_text = f"{pdf_name} ({len(pdf.pages)} pages)"
358
+ draw.text((10, 10), label_text, fill=label_text_color, font=font)
359
+
360
+ # Paste PDF image below label
361
+ labeled_image.paste(pdf_image, (0, label_height))
362
+
363
+ all_images.append(labeled_image)
364
+ total_pages_shown += min(pdf_limit, len(pdf.pages))
365
+
366
+ except Exception as e:
367
+ logger.warning(f"Failed to render PDF {pdf_name}: {e}")
368
+ continue
369
+
370
+ if not all_images:
371
+ print("No PDFs could be rendered")
372
+ return None
373
+
374
+ # Combine all images vertically
375
+ if len(all_images) == 1:
376
+ combined = all_images[0]
377
+ else:
378
+ # Add spacing between PDFs
379
+ spacing = 20
380
+ total_height = sum(img.height for img in all_images) + spacing * (len(all_images) - 1)
381
+ max_width = max(img.width for img in all_images)
382
+
383
+ combined = Image.new("RGB", (max_width, total_height), "white")
384
+
385
+ y_offset = 0
386
+ for i, img in enumerate(all_images):
387
+ # Center images if they're narrower than max width
388
+ x_offset = (max_width - img.width) // 2
389
+ combined.paste(img, (x_offset, y_offset))
390
+ y_offset += img.height
391
+ if i < len(all_images) - 1:
392
+ y_offset += spacing
393
+
394
+ # Return the combined image (Jupyter will display it automatically)
395
+ return combined
396
+
270
397
  @overload
271
398
  def find_all(
272
399
  self,
@@ -186,7 +186,7 @@ class Visualizable:
186
186
  color: Optional[Union[str, Tuple[int, int, int]]] = None,
187
187
  labels: bool = True,
188
188
  label_format: Optional[str] = None,
189
- highlights: Optional[List[Dict[str, Any]]] = None,
189
+ highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
190
190
  legend_position: str = "right",
191
191
  annotate: Optional[Union[str, List[str]]] = None,
192
192
  # Layout options for multi-page/region
@@ -211,7 +211,7 @@ class Visualizable:
211
211
  color: Default highlight color
212
212
  labels: Whether to show labels for highlights
213
213
  label_format: Format string for labels (e.g., "Element {index}")
214
- highlights: Additional highlight groups to show
214
+ highlights: Additional highlight groups to show, or False to disable all highlights
215
215
  legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
216
216
  annotate: Attribute name(s) to display on highlights (string or list)
217
217
  layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)
@@ -1192,7 +1192,7 @@ class Element(
1192
1192
  self,
1193
1193
  mode: Literal["show", "render"] = "show",
1194
1194
  color: Optional[Union[str, Tuple[int, int, int]]] = None,
1195
- highlights: Optional[List[Dict[str, Any]]] = None,
1195
+ highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
1196
1196
  crop: Union[bool, Literal["content"]] = False,
1197
1197
  crop_bbox: Optional[Tuple[float, float, float, float]] = None,
1198
1198
  label: Optional[str] = None,
@@ -1203,7 +1203,7 @@ class Element(
1203
1203
  Args:
1204
1204
  mode: Rendering mode - 'show' includes highlights, 'render' is clean
1205
1205
  color: Color for highlighting this element in show mode
1206
- highlights: Additional highlight groups to show
1206
+ highlights: Additional highlight groups to show, or False to disable all highlights
1207
1207
  crop: Whether to crop to element bounds
1208
1208
  crop_bbox: Explicit crop bounds
1209
1209
  label: Optional label for this element
@@ -1225,19 +1225,23 @@ class Element(
1225
1225
  if hasattr(self, "bbox") and self.bbox:
1226
1226
  spec.crop_bbox = self.bbox
1227
1227
 
1228
- # Add highlight in show mode
1229
- if mode == "show":
1230
- # Use provided label or generate one
1231
- element_label = label if label is not None else self.__class__.__name__
1232
-
1233
- spec.add_highlight(
1234
- element=self,
1235
- color=color or "red", # Default red for single element
1236
- label=element_label,
1237
- )
1228
+ # Add highlight in show mode (unless explicitly disabled with highlights=False)
1229
+ if mode == "show" and highlights is not False:
1230
+ # Only highlight this element if:
1231
+ # 1. We're not cropping, OR
1232
+ # 2. We're cropping but color was explicitly specified
1233
+ if not crop or color is not None:
1234
+ # Use provided label or generate one
1235
+ element_label = label if label is not None else self.__class__.__name__
1236
+
1237
+ spec.add_highlight(
1238
+ element=self,
1239
+ color=color or "red", # Default red for single element
1240
+ label=element_label,
1241
+ )
1238
1242
 
1239
- # Add additional highlight groups if provided
1240
- if highlights:
1243
+ # Add additional highlight groups if provided (and highlights is a list)
1244
+ if highlights and isinstance(highlights, list):
1241
1245
  for group in highlights:
1242
1246
  group_elements = group.get("elements", [])
1243
1247
  group_color = group.get("color", color)
@@ -221,7 +221,7 @@ class Region(
221
221
  self,
222
222
  mode: Literal["show", "render"] = "show",
223
223
  color: Optional[Union[str, Tuple[int, int, int]]] = None,
224
- highlights: Optional[List[Dict[str, Any]]] = None,
224
+ highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
225
225
  crop: Union[bool, Literal["content"]] = True, # Default to True for regions
226
226
  crop_bbox: Optional[Tuple[float, float, float, float]] = None,
227
227
  **kwargs,
@@ -231,7 +231,7 @@ class Region(
231
231
  Args:
232
232
  mode: Rendering mode - 'show' includes highlights, 'render' is clean
233
233
  color: Color for highlighting this region in show mode
234
- highlights: Additional highlight groups to show
234
+ highlights: Additional highlight groups to show, or False to disable all highlights
235
235
  crop: Whether to crop to this region
236
236
  crop_bbox: Explicit crop bounds (overrides region bounds)
237
237
  **kwargs: Additional parameters
@@ -250,10 +250,12 @@ class Region(
250
250
  # Crop to this region's bounds
251
251
  spec.crop_bbox = self.bbox
252
252
 
253
- # Add highlights in show mode
254
- if mode == "show":
255
- # Highlight this region
256
- if color or mode == "show": # Always highlight in show mode
253
+ # Add highlights in show mode (unless explicitly disabled with highlights=False)
254
+ if mode == "show" and highlights is not False:
255
+ # Only highlight this region if:
256
+ # 1. We're not cropping, OR
257
+ # 2. We're cropping but color was explicitly specified
258
+ if not crop or color is not None:
257
259
  spec.add_highlight(
258
260
  bbox=self.bbox,
259
261
  polygon=self.polygon if self.has_polygon else None,
@@ -261,8 +263,8 @@ class Region(
261
263
  label=self.label or self.name or "Region",
262
264
  )
263
265
 
264
- # Add additional highlight groups if provided
265
- if highlights:
266
+ # Add additional highlight groups if provided (and highlights is a list)
267
+ if highlights and isinstance(highlights, list):
266
268
  for group in highlights:
267
269
  elements = group.get("elements", [])
268
270
  group_color = group.get("color", color)
@@ -0,0 +1,7 @@
1
+ """Vision module for visual similarity and pattern matching"""
2
+
3
+ from .mixin import VisualSearchMixin
4
+ from .results import Match, MatchResults
5
+ from .similarity import VisualMatcher, compute_phash
6
+
7
+ __all__ = ["VisualMatcher", "compute_phash", "Match", "MatchResults", "VisualSearchMixin"]
@@ -0,0 +1,209 @@
1
+ """Mixin to add visual similarity search to Page/PDF/PDFCollection"""
2
+
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import numpy as np
6
+ from PIL import Image
7
+ from tqdm.auto import tqdm
8
+
9
+ from .results import Match, MatchResults
10
+ from .similarity import VisualMatcher, compute_phash
11
+
12
+
13
+ class VisualSearchMixin:
14
+ """Add find_similar method to classes that include this mixin"""
15
+
16
+ def find_similar(
17
+ self,
18
+ examples: Union["Element", "Region", List[Union["Element", "Region"]]],
19
+ using: str = "vision",
20
+ confidence: float = 0.6,
21
+ sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
22
+ resolution: int = 72,
23
+ hash_size: int = 20,
24
+ step_factor: float = 0.1,
25
+ max_per_page: Optional[int] = None,
26
+ show_progress: bool = True,
27
+ **kwargs,
28
+ ) -> MatchResults:
29
+ """
30
+ Find regions visually similar to the given example(s).
31
+
32
+ Args:
33
+ examples: Single element/region or list of examples to search for
34
+ using: Search method - currently only 'vision' is supported
35
+ confidence: Minimum similarity score (0-1)
36
+ sizes: Size variations to search. Can be:
37
+ - float: ±percentage (e.g., 0.2 = 80%-120%)
38
+ - tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.2))
39
+ - tuple(min, max, step): explicit step size
40
+ - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
41
+ resolution: Resolution for image comparison (DPI) (default: 72)
42
+ hash_size: Size of perceptual hash grid (default: 20)
43
+ step_factor: Step size as fraction of template size (default: 0.1)
44
+ max_per_page: Maximum matches to return per page
45
+ show_progress: Show progress bar for multi-page searches (default: True)
46
+ **kwargs: Additional options
47
+
48
+ Returns:
49
+ MatchResults collection
50
+ """
51
+ if using != "vision":
52
+ raise NotImplementedError(f"using='{using}' not yet supported")
53
+
54
+ # Ensure examples is a list
55
+ if not isinstance(examples, list):
56
+ examples = [examples]
57
+
58
+ # Initialize matcher with specified hash size
59
+ matcher = VisualMatcher(hash_size=hash_size)
60
+
61
+ # Prepare templates
62
+ templates = []
63
+ for example in examples:
64
+ # Render the example region/element
65
+ example_image = example.render(resolution=resolution, crop=True)
66
+ template_hash = compute_phash(example_image, hash_size=hash_size)
67
+ templates.append({"image": example_image, "hash": template_hash, "source": example})
68
+
69
+ # Get pages to search based on the object type
70
+ if hasattr(self, "__class__") and self.__class__.__name__ == "PDFCollection":
71
+ # PDFCollection needs to iterate through all PDFs
72
+ pages_to_search = []
73
+ for pdf in self:
74
+ pages_to_search.extend(pdf.pages)
75
+ elif hasattr(self, "pages"): # PDF
76
+ pages_to_search = self.pages
77
+ elif hasattr(self, "number"): # Single page
78
+ pages_to_search = [self]
79
+ else:
80
+ raise TypeError(f"Cannot search in {type(self)}")
81
+
82
+ # Calculate total operations for progress bar
83
+ total_operations = 0
84
+ if show_progress:
85
+ # Get scales that will be searched
86
+ scales = matcher._get_search_scales(sizes)
87
+
88
+ # Pre-calculate for all pages and templates
89
+ for page in pages_to_search:
90
+ # Estimate page image size
91
+ page_w = int(page.width * resolution / 72.0)
92
+ page_h = int(page.height * resolution / 72.0)
93
+
94
+ for template_data in templates:
95
+ template_w, template_h = template_data["image"].size
96
+
97
+ for scale in scales:
98
+ scaled_w = int(template_w * scale)
99
+ scaled_h = int(template_h * scale)
100
+
101
+ if scaled_w <= page_w and scaled_h <= page_h:
102
+ step_x = max(1, int(scaled_w * step_factor))
103
+ step_y = max(1, int(scaled_h * step_factor))
104
+
105
+ x_windows = len(range(0, page_w - scaled_w + 1, step_x))
106
+ y_windows = len(range(0, page_h - scaled_h + 1, step_y))
107
+ total_operations += x_windows * y_windows
108
+
109
+ # Search each page
110
+ all_matches = []
111
+
112
+ # Create single progress bar for all operations
113
+ progress_bar = None
114
+ operations_done = 0
115
+ last_update = 0
116
+ update_frequency = max(1, total_operations // 1000) # Update at most 1000 times
117
+
118
+ if show_progress and total_operations > 0:
119
+ progress_bar = tqdm(
120
+ total=total_operations,
121
+ desc="Searching",
122
+ unit="window",
123
+ miniters=update_frequency, # Minimum iterations between updates
124
+ mininterval=0.1, # Minimum time between updates (seconds)
125
+ )
126
+
127
+ for page_idx, page in enumerate(pages_to_search):
128
+ # Render the full page once
129
+ page_image = page.render(resolution=resolution)
130
+
131
+ # Convert page coordinates to image coordinates
132
+ scale = resolution / 72.0 # PDF is 72 DPI
133
+
134
+ page_matches = []
135
+
136
+ # Search for each template
137
+ for template_idx, template_data in enumerate(templates):
138
+ template_image = template_data["image"]
139
+ template_hash = template_data["hash"]
140
+
141
+ # Custom progress callback to update our main progress bar
142
+ def update_progress():
143
+ nonlocal operations_done, last_update
144
+ operations_done += 1
145
+
146
+ # Only update progress bar every N operations to avoid overwhelming output
147
+ if progress_bar and (
148
+ operations_done - last_update >= update_frequency
149
+ or operations_done == total_operations
150
+ ):
151
+ progress_bar.update(operations_done - last_update)
152
+ last_update = operations_done
153
+
154
+ # Update description with current page/template info
155
+ if len(pages_to_search) > 1:
156
+ progress_bar.set_description(
157
+ f"Page {page.number}/{len(pages_to_search)}"
158
+ )
159
+ elif len(templates) > 1:
160
+ progress_bar.set_description(
161
+ f"Template {template_idx + 1}/{len(templates)}"
162
+ )
163
+
164
+ # Find matches in this page - never show internal progress
165
+ candidates = matcher.find_matches_in_image(
166
+ template_image,
167
+ page_image,
168
+ template_hash=template_hash,
169
+ confidence_threshold=confidence,
170
+ sizes=sizes,
171
+ step_factor=step_factor,
172
+ show_progress=False, # We handle progress ourselves
173
+ progress_callback=update_progress if progress_bar else None,
174
+ **kwargs,
175
+ )
176
+
177
+ # Convert image coordinates back to PDF coordinates
178
+ for candidate in candidates:
179
+ img_x0, img_y0, img_x1, img_y1 = candidate.bbox
180
+
181
+ # Convert from image pixels to PDF points
182
+ # No flipping needed! PDF coordinates map directly to PIL coordinates
183
+ pdf_x0 = img_x0 / scale
184
+ pdf_y0 = img_y0 / scale
185
+ pdf_x1 = img_x1 / scale
186
+ pdf_y1 = img_y1 / scale
187
+
188
+ # Create Match object
189
+ match = Match(
190
+ page=page,
191
+ bbox=(pdf_x0, pdf_y0, pdf_x1, pdf_y1),
192
+ confidence=candidate.confidence,
193
+ source_example=template_data["source"],
194
+ )
195
+ page_matches.append(match)
196
+
197
+ # Apply max_per_page limit if specified
198
+ if max_per_page and len(page_matches) > max_per_page:
199
+ # Sort by confidence and take top N
200
+ page_matches.sort(key=lambda m: m.confidence, reverse=True)
201
+ page_matches = page_matches[:max_per_page]
202
+
203
+ all_matches.extend(page_matches)
204
+
205
+ # Close progress bar
206
+ if progress_bar:
207
+ progress_bar.close()
208
+
209
+ return MatchResults(all_matches)
@@ -0,0 +1,146 @@
1
+ """Match results for visual similarity search"""
2
+
3
+ from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple
4
+
5
+ # Import Region directly as it's a base class
6
+ from natural_pdf.elements.region import Region
7
+
8
+ if TYPE_CHECKING:
9
+ from natural_pdf.core.page_collection import PageCollection
10
+ from natural_pdf.elements.element_collection import ElementCollection
11
+
12
+
13
+ class Match(Region):
14
+ """A region that was found via visual similarity search"""
15
+
16
+ def __init__(self, page, bbox, confidence, source_example=None, metadata=None):
17
+ """
18
+ Initialize a Match object.
19
+
20
+ Args:
21
+ page: Page containing the match
22
+ bbox: Bounding box of the match
23
+ confidence: Similarity confidence (0-1)
24
+ source_example: The example/template that led to this match
25
+ metadata: Additional metadata about the match
26
+ """
27
+ super().__init__(page, bbox)
28
+ self.confidence = confidence
29
+ self.source_example = source_example
30
+ self.metadata = metadata or {}
31
+
32
+ @property
33
+ def pdf(self):
34
+ """Get the PDF containing this match"""
35
+ return self.page.pdf
36
+
37
+ def __repr__(self):
38
+ return f"<Match page={self.page.number} confidence={self.confidence:.2f} bbox={self.bbox}>"
39
+
40
+
41
+ class MatchResults:
42
+ """Collection of Match objects with transformation methods"""
43
+
44
+ def __init__(self, matches: List[Match]):
45
+ """Initialize with list of Match objects"""
46
+ # Import here to avoid circular import
47
+ from natural_pdf.elements.element_collection import ElementCollection
48
+
49
+ # Create a base ElementCollection
50
+ self._collection = ElementCollection(matches)
51
+ self._matches = matches
52
+
53
+ def __len__(self):
54
+ return len(self._matches)
55
+
56
+ def __iter__(self):
57
+ return iter(self._matches)
58
+
59
+ def __getitem__(self, key):
60
+ return self._matches[key]
61
+
62
+ def filter(self, filter_func) -> "MatchResults":
63
+ """Filter matches by a function"""
64
+ filtered = [m for m in self if filter_func(m)]
65
+ return MatchResults(filtered)
66
+
67
+ def filter_by_confidence(self, min_confidence: float) -> "MatchResults":
68
+ """Filter matches by minimum confidence"""
69
+ return self.filter(lambda m: m.confidence >= min_confidence)
70
+
71
+ def pages(self):
72
+ """Get unique pages containing matches"""
73
+ # Import here to avoid circular import
74
+ from natural_pdf.core.page_collection import PageCollection
75
+
76
+ # Get unique pages while preserving order
77
+ seen = set()
78
+ unique_pages = []
79
+ for match in self:
80
+ if match.page not in seen:
81
+ seen.add(match.page)
82
+ unique_pages.append(match.page)
83
+
84
+ # Attach matches to each page
85
+ for page in unique_pages:
86
+ page._matches = MatchResults([m for m in self if m.page == page])
87
+
88
+ return PageCollection(unique_pages)
89
+
90
+ def pdfs(self):
91
+ """Get unique PDFs containing matches"""
92
+ # Import here to avoid circular import
93
+ from natural_pdf.core.pdf_collection import PDFCollection
94
+
95
+ # Get unique PDFs while preserving order
96
+ seen = set()
97
+ unique_pdfs = []
98
+ for match in self:
99
+ if match.pdf not in seen:
100
+ seen.add(match.pdf)
101
+ unique_pdfs.append(match.pdf)
102
+
103
+ # Attach matches to each PDF
104
+ for pdf in unique_pdfs:
105
+ pdf._matches = MatchResults([m for m in self if m.pdf == pdf])
106
+
107
+ return PDFCollection(unique_pdfs)
108
+
109
+ def group_by_page(self) -> Iterator[Tuple[Any, "MatchResults"]]:
110
+ """Group matches by page"""
111
+ from itertools import groupby
112
+
113
+ # Sort by PDF filename and page number
114
+ sorted_matches = sorted(self, key=lambda m: (getattr(m.pdf, "filename", ""), m.page.number))
115
+
116
+ for page, matches in groupby(sorted_matches, key=lambda m: m.page):
117
+ yield page, MatchResults(list(matches))
118
+
119
+ def sort_by_confidence(self, descending: bool = True) -> "MatchResults":
120
+ """Sort matches by confidence score"""
121
+ sorted_matches = sorted(self, key=lambda m: m.confidence, reverse=descending)
122
+ return MatchResults(sorted_matches)
123
+
124
+ def regions(self):
125
+ """Get all matches as an ElementCollection of regions"""
126
+ # Import here to avoid circular import
127
+ from natural_pdf.elements.element_collection import ElementCollection
128
+
129
+ # Matches are already Region objects, so just wrap them
130
+ return ElementCollection(list(self))
131
+
132
+ def show(self, **kwargs):
133
+ """Show all matches using ElementCollection.show()"""
134
+ # Get regions and show them
135
+ return self.regions().show(**kwargs)
136
+
137
+ def __repr__(self):
138
+ if len(self) == 0:
139
+ return "<MatchResults: empty>"
140
+ elif len(self) == 1:
141
+ return f"<MatchResults: 1 match>"
142
+ else:
143
+ conf_range = (
144
+ f"{min(m.confidence for m in self):.2f}-{max(m.confidence for m in self):.2f}"
145
+ )
146
+ return f"<MatchResults: {len(self)} matches, confidence {conf_range}>"
@@ -0,0 +1,321 @@
1
+ """Visual similarity matching using perceptual hashing"""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Callable, List, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ from PIL import Image
8
+ from tqdm.auto import tqdm
9
+
10
+
11
+ @dataclass
12
+ class MatchCandidate:
13
+ """Candidate match during sliding window search"""
14
+
15
+ bbox: Tuple[float, float, float, float]
16
+ hash_value: int
17
+ confidence: float
18
+
19
+
20
+ def compute_phash(image: Image.Image, hash_size: int = 8, blur_radius: float = 0) -> int:
21
+ """
22
+ Compute perceptual hash of an image using DCT.
23
+
24
+ Args:
25
+ image: PIL Image to hash
26
+ hash_size: Size of the hash (8 = 64 bit hash)
27
+ blur_radius: Optional blur to apply before hashing (makes more tolerant)
28
+
29
+ Returns:
30
+ Integer hash value
31
+ """
32
+ # Convert to grayscale
33
+ if image.mode != "L":
34
+ image = image.convert("L")
35
+
36
+ # Optional blur to reduce sensitivity to minor variations
37
+ if blur_radius > 0:
38
+ from PIL import ImageFilter
39
+
40
+ image = image.filter(ImageFilter.GaussianBlur(radius=blur_radius))
41
+
42
+ # Resize to a (hash_size * 4) square — 4x oversampling so the DCT retains enough low-frequency detail
43
+ highfreq_factor = 4
44
+ img_size = hash_size * highfreq_factor
45
+ image = image.resize((img_size, img_size), Image.Resampling.LANCZOS)
46
+
47
+ # Convert to numpy array
48
+ pixels = np.array(image, dtype=np.float32)
49
+
50
+ # Apply DCT
51
+ from scipy.fftpack import dct
52
+
53
+ dct_coef = dct(dct(pixels, axis=0), axis=1)
54
+
55
+ # Keep top-left hash_size x hash_size block (low frequencies)
56
+ dct_low = dct_coef[:hash_size, :hash_size]
57
+
58
+ # Compute median excluding the DC component
59
+ dct_low_no_dc = dct_low.flatten()[1:] # Skip first element (DC)
60
+ median = np.median(dct_low_no_dc)
61
+
62
+ # Create binary hash
63
+ diff = dct_low.flatten() > median
64
+
65
+ # Convert to integer
66
+ return sum(2**i for i, v in enumerate(diff) if v)
67
+
68
+
69
+ def hamming_distance(hash1: int, hash2: int, hash_size: int = 64) -> int:
70
+ """Calculate Hamming distance between two hashes"""
71
+ # XOR and count set bits
72
+ xor = hash1 ^ hash2
73
+ return bin(xor).count("1")
74
+
75
+
76
+ def hash_similarity(hash1: int, hash2: int, hash_size: int = 64) -> float:
77
+ """Calculate similarity score between two hashes (0-1)"""
78
+ distance = hamming_distance(hash1, hash2, hash_size)
79
+ return 1.0 - (distance / hash_size)
80
+
81
+
82
+ class VisualMatcher:
83
+ """Handles visual similarity matching using perceptual hashing"""
84
+
85
+ def __init__(self, hash_size: int = 12):
86
+ self.hash_size = hash_size
87
+ self.hash_bits = hash_size * hash_size
88
+ self._cache = {}
89
+
90
+ def _get_search_scales(self, sizes: Optional[Union[float, Tuple, List]]) -> List[float]:
91
+ """
92
+ Convert various size input formats to a list of scales to search.
93
+
94
+ Args:
95
+ sizes: Can be:
96
+ - None: just 1.0
97
+ - float: ±percentage (e.g., 0.2 = 80%-120%)
98
+ - tuple(min, max): range with smart logarithmic steps
99
+ - tuple(min, max, step): explicit step size
100
+ - list: exact sizes to use
101
+
102
+ Returns:
103
+ List of scale factors to search
104
+ """
105
+ if sizes is None:
106
+ return [1.0]
107
+
108
+ # List of exact sizes
109
+ if isinstance(sizes, list):
110
+ return sorted(sizes)
111
+
112
+ # Single float: ±percentage
113
+ if isinstance(sizes, (int, float)):
114
+ if sizes <= 0:
115
+ return [1.0]
116
+ # Convert to min/max range
117
+ min_scale = max(0.1, 1.0 - sizes)
118
+ max_scale = 1.0 + sizes
119
+ # Use tuple logic below
120
+ sizes = (min_scale, max_scale)
121
+
122
+ # Tuple handling
123
+ if isinstance(sizes, tuple):
124
+ if len(sizes) == 2:
125
+ min_scale, max_scale = sizes
126
+ if min_scale >= max_scale:
127
+ return [min_scale]
128
+
129
+ # Smart defaults with logarithmic spacing
130
+ # Calculate range ratio to determine number of steps
131
+ ratio = max_scale / min_scale
132
+
133
+ if ratio <= 1.5: # Small range (e.g., 0.8-1.2)
134
+ num_steps = 5
135
+ elif ratio <= 3.0: # Medium range (e.g., 0.5-1.5)
136
+ num_steps = 7
137
+ else: # Large range (e.g., 0.5-2.0)
138
+ num_steps = 9
139
+
140
+ # Generate logarithmically spaced scales
141
+ log_min = np.log(min_scale)
142
+ log_max = np.log(max_scale)
143
+ log_scales = np.linspace(log_min, log_max, num_steps)
144
+ scales = np.exp(log_scales).tolist()
145
+
146
+ # Ensure 1.0 is included if in range
147
+ if min_scale <= 1.0 <= max_scale and 1.0 not in scales:
148
+ # Find closest scale and replace with 1.0
149
+ closest_idx = np.argmin([abs(s - 1.0) for s in scales])
150
+ scales[closest_idx] = 1.0
151
+
152
+ return scales
153
+
154
+ elif len(sizes) == 3:
155
+ # Explicit (min, max, step)
156
+ min_scale, max_scale, step = sizes
157
+ scales = []
158
+ current = min_scale
159
+ while current <= max_scale:
160
+ scales.append(current)
161
+ current += step
162
+ # Ensure max is included if close
163
+ if scales[-1] < max_scale and (max_scale - scales[-1]) < step * 0.1:
164
+ scales[-1] = max_scale
165
+ return scales
166
+
167
+ raise ValueError(f"Invalid sizes format: {sizes}")
168
+
169
def find_matches_in_image(
    self,
    template: Image.Image,
    target: Image.Image,
    template_hash: Optional[int] = None,
    confidence_threshold: float = 0.6,
    step_factor: float = 0.1,
    sizes: Optional[Union[float, Tuple, List]] = None,
    show_progress: bool = True,
    progress_callback: Optional[Callable[[], None]] = None,
) -> List[MatchCandidate]:
    """
    Find all matches of template in target image using sliding window.

    The search slides a window of the (scaled) template size over the target,
    compares perceptual hashes, and collects every window whose similarity
    meets the threshold. Overlapping hits are then reduced to the
    highest-confidence ones.

    Args:
        template: Template image to search for
        target: Target image to search in
        template_hash: Pre-computed hash of template (optional; recomputed
            with ``compute_phash`` when omitted)
        confidence_threshold: Minimum similarity score (0-1)
        step_factor: Step size as fraction of template size
        sizes: Size variations to search. Can be:
            - float: ±percentage (e.g., 0.2 = 80%-120%)
            - tuple(min, max): search range with smart logarithmic steps
            - tuple(min, max, step): explicit step size
            - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
        show_progress: Show progress bar for sliding window search
        progress_callback: Optional callback function to call for each window
            checked. When provided, the internal tqdm bar is suppressed and
            progress reporting is delegated entirely to the callback.

    Returns:
        List of MatchCandidate objects (non-overlapping, highest confidence
        kept when candidates overlap)
    """
    matches = []

    # Compute template hash if not provided.
    # NOTE(review): assumes self.hash_size / self.hash_bits are configured
    # on the instance (e.g. by __init__) — confirm against class definition.
    if template_hash is None:
        template_hash = compute_phash(template, self.hash_size)

    template_w, template_h = template.size
    target_w, target_h = target.size

    # Determine scales to search
    scales = self._get_search_scales(sizes)

    # Pre-count windows so tqdm can display a bounded bar. Skipped when the
    # caller supplies its own callback (the caller then owns progress totals).
    total_iterations = 0
    if show_progress and not progress_callback:
        for scale in scales:
            scaled_w = int(template_w * scale)
            scaled_h = int(template_h * scale)
            if scaled_w <= target_w and scaled_h <= target_h:
                step_x = max(1, int(scaled_w * step_factor))
                step_y = max(1, int(scaled_h * step_factor))
                x_steps = len(range(0, target_w - scaled_w + 1, step_x))
                y_steps = len(range(0, target_h - scaled_h + 1, step_y))
                total_iterations += x_steps * y_steps

    # Setup progress bar if needed (only if no callback provided)
    progress_bar = None
    if show_progress and not progress_callback and total_iterations > 0:
        progress_bar = tqdm(total=total_iterations, desc="Scanning", unit="window", leave=False)

    # Search at each scale
    for scale in scales:
        # Scale template size
        scaled_w = int(template_w * scale)
        scaled_h = int(template_h * scale)

        # Template larger than target at this scale — no window fits.
        if scaled_w > target_w or scaled_h > target_h:
            continue

        # Step at a fraction of the window size; clamp to >= 1 px so tiny
        # templates still advance.
        step_x = max(1, int(scaled_w * step_factor))
        step_y = max(1, int(scaled_h * step_factor))

        # Sliding window search
        for y in range(0, target_h - scaled_h + 1, step_y):
            for x in range(0, target_w - scaled_w + 1, step_x):
                # Extract window
                window = target.crop((x, y, x + scaled_w, y + scaled_h))

                # Resize to template size so hashes are computed on images of
                # identical dimensions (only needed for non-unit scales).
                if scale != 1.0:
                    window = window.resize((template_w, template_h), Image.Resampling.LANCZOS)

                # Compute hash and similarity
                window_hash = compute_phash(window, self.hash_size)
                similarity = hash_similarity(template_hash, window_hash, self.hash_bits)

                if similarity >= confidence_threshold:
                    # bbox is in target-image pixel coordinates (unscaled)
                    bbox = (x, y, x + scaled_w, y + scaled_h)
                    matches.append(MatchCandidate(bbox, window_hash, similarity))

                # Update progress — exactly one tick per window examined
                if progress_bar:
                    progress_bar.update(1)
                elif progress_callback:
                    progress_callback()

    # Close progress bar
    if progress_bar:
        progress_bar.close()

    # Remove overlapping matches (keep highest confidence)
    return self._filter_overlapping_matches(matches)
275
def _filter_overlapping_matches(
    self, matches: List[MatchCandidate], overlap_threshold: float = 0.5
) -> List[MatchCandidate]:
    """Greedy non-maximum suppression over match candidates.

    Candidates are visited in descending confidence order; each one is kept
    only if its IoU with every already-kept match stays at or below
    ``overlap_threshold``.
    """
    if not matches:
        return matches

    kept: List[MatchCandidate] = []
    # Stable sort: equal-confidence candidates retain their original order.
    for candidate in sorted(matches, key=lambda m: m.confidence, reverse=True):
        overlaps_kept = any(
            self._calculate_overlap(candidate.bbox, prior.bbox) > overlap_threshold
            for prior in kept
        )
        if not overlaps_kept:
            kept.append(candidate)

    return kept
300
+ def _calculate_overlap(self, bbox1: Tuple, bbox2: Tuple) -> float:
301
+ """Calculate intersection over union (IoU) for two bboxes"""
302
+ x1_min, y1_min, x1_max, y1_max = bbox1
303
+ x2_min, y2_min, x2_max, y2_max = bbox2
304
+
305
+ # Calculate intersection
306
+ intersect_xmin = max(x1_min, x2_min)
307
+ intersect_ymin = max(y1_min, y2_min)
308
+ intersect_xmax = min(x1_max, x2_max)
309
+ intersect_ymax = min(y1_max, y2_max)
310
+
311
+ if intersect_xmax < intersect_xmin or intersect_ymax < intersect_ymin:
312
+ return 0.0
313
+
314
+ intersect_area = (intersect_xmax - intersect_xmin) * (intersect_ymax - intersect_ymin)
315
+
316
+ # Calculate union
317
+ area1 = (x1_max - x1_min) * (y1_max - y1_min)
318
+ area2 = (x2_max - x2_min) * (y2_max - y2_min)
319
+ union_area = area1 + area2 - intersect_area
320
+
321
+ return intersect_area / union_area if union_area > 0 else 0.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -27,24 +27,24 @@ natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666
27
27
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
28
  natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
29
29
  natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
30
- natural_pdf/core/page.py,sha256=4-il2WPMVX4hNSgQ5P6yLc1-3jXfi73WCrpF9912ct4,142472
30
+ natural_pdf/core/page.py,sha256=XrDePXZgXgB3w8hvxh4-EhPQnrwmw-0z-I_K24__OtY,142550
31
31
  natural_pdf/core/page_collection.py,sha256=hEeXs_fzB73XZ8ZkHz2kIuSgBYcVYydvGMMdGuB1rvw,52486
32
32
  natural_pdf/core/page_groupby.py,sha256=550ME6kd-h-2u75oUIIIqTYsmh8VvdQO1nXXioL8J6A,7378
33
- natural_pdf/core/pdf.py,sha256=q54DyhXwAS_zAmsBd3PsCezu1wyQOYmGmB3iKfP8gAM,101884
34
- natural_pdf/core/pdf_collection.py,sha256=8tM0qVWS1L5Hwv5cXuZ2X8znAYOjKmlERX62bksDlJU,30144
35
- natural_pdf/core/render_spec.py,sha256=3GTfnlv8JKzePrruLq_dNr3HFeWMVcZT2fwWmJN44NI,14456
33
+ natural_pdf/core/pdf.py,sha256=Loe6sbQzBp9VDeIAuDS3zQmeDWvQMj5SWIQMky5bPDA,101964
34
+ natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
35
+ natural_pdf/core/render_spec.py,sha256=rLicaS9EPyojpJcjy2Lzn5DLWQwjrFyDJyRo7jbjdGU,14505
36
36
  natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
37
37
  natural_pdf/describe/base.py,sha256=Of9WVo9XuShXoeyJr0RN2CpLhF_CeiOjazl-or53RKU,18173
38
38
  natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
39
39
  natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
40
40
  natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
41
41
  natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
42
- natural_pdf/elements/base.py,sha256=xXdNV1_gt4T_V_4m6qJDieWiysvJxUBhSEEAJzMOzqo,55094
42
+ natural_pdf/elements/base.py,sha256=aj-eXOQQlhKv9lYeUlUs9aKNcUebtG_dqxURZHZVZ58,55509
43
43
  natural_pdf/elements/element_collection.py,sha256=slCUnOT04sNOTjSGgmhjcCKKPVPtdDPwU7PX1ebzGMw,101342
44
44
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
45
45
  natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
46
46
  natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
47
- natural_pdf/elements/region.py,sha256=Onok5VzmF1CvMCa3UGLUszCuhL-CCGk_IgtSUDva-Cc,155314
47
+ natural_pdf/elements/region.py,sha256=RxWidI7oNrdbuuj94SfdFXmcSDTfy89uGCeVMQvAfks,155591
48
48
  natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
49
49
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
50
50
  natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -100,9 +100,13 @@ natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-
100
100
  natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
101
101
  natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
102
102
  natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
103
+ natural_pdf/vision/__init__.py,sha256=RymMY-3WLQBlOZ4Dx4MmL9UH6I65hNjkwUJ7ymO5JfM,287
104
+ natural_pdf/vision/mixin.py,sha256=OJwBABr74TWxP5seTKUmGj5zE9mWsBP_UKWU-Pr8V9A,8720
105
+ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs,5119
106
+ natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
103
107
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
104
108
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
105
- natural_pdf-0.2.3.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
109
+ natural_pdf-0.2.4.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
106
110
  optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
107
111
  optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
108
112
  optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
@@ -119,8 +123,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
119
123
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
120
124
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
121
125
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
122
- natural_pdf-0.2.3.dist-info/METADATA,sha256=lyx6Cx1xPGhy-p1m0wRfTvv4zSJ4ZJnNo7DeGQZ99yU,6959
123
- natural_pdf-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
124
- natural_pdf-0.2.3.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
125
- natural_pdf-0.2.3.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
126
- natural_pdf-0.2.3.dist-info/RECORD,,
126
+ natural_pdf-0.2.4.dist-info/METADATA,sha256=G1tmes61GVEt6zLeDISuJZgceLQywIU-uRspGA_90Q8,6959
127
+ natural_pdf-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
128
+ natural_pdf-0.2.4.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
129
+ natural_pdf-0.2.4.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
130
+ natural_pdf-0.2.4.dist-info/RECORD,,