natural-pdf 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registry. It is provided for informational purposes only.
@@ -1192,7 +1192,7 @@ class Element(
         self,
         mode: Literal["show", "render"] = "show",
         color: Optional[Union[str, Tuple[int, int, int]]] = None,
-        highlights: Optional[List[Dict[str, Any]]] = None,
+        highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
         crop: Union[bool, Literal["content"]] = False,
         crop_bbox: Optional[Tuple[float, float, float, float]] = None,
         label: Optional[str] = None,
@@ -1203,7 +1203,7 @@ class Element(
         Args:
             mode: Rendering mode - 'show' includes highlights, 'render' is clean
             color: Color for highlighting this element in show mode
-            highlights: Additional highlight groups to show
+            highlights: Additional highlight groups to show, or False to disable all highlights
             crop: Whether to crop to element bounds
             crop_bbox: Explicit crop bounds
             label: Optional label for this element
@@ -1225,19 +1225,23 @@ class Element(
         if hasattr(self, "bbox") and self.bbox:
             spec.crop_bbox = self.bbox
 
-        # Add highlight in show mode
-        if mode == "show":
-            # Use provided label or generate one
-            element_label = label if label is not None else self.__class__.__name__
-
-            spec.add_highlight(
-                element=self,
-                color=color or "red",  # Default red for single element
-                label=element_label,
-            )
+        # Add highlight in show mode (unless explicitly disabled with highlights=False)
+        if mode == "show" and highlights is not False:
+            # Only highlight this element if:
+            # 1. We're not cropping, OR
+            # 2. We're cropping but color was explicitly specified
+            if not crop or color is not None:
+                # Use provided label or generate one
+                element_label = label if label is not None else self.__class__.__name__
+
+                spec.add_highlight(
+                    element=self,
+                    color=color or "red",  # Default red for single element
+                    label=element_label,
+                )
 
-        # Add additional highlight groups if provided
-        if highlights:
+        # Add additional highlight groups if provided (and highlights is a list)
+        if highlights and isinstance(highlights, list):
             for group in highlights:
                 group_elements = group.get("elements", [])
                 group_color = group.get("color", color)
@@ -221,7 +221,7 @@ class Region(
         self,
         mode: Literal["show", "render"] = "show",
         color: Optional[Union[str, Tuple[int, int, int]]] = None,
-        highlights: Optional[List[Dict[str, Any]]] = None,
+        highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
         crop: Union[bool, Literal["content"]] = True,  # Default to True for regions
         crop_bbox: Optional[Tuple[float, float, float, float]] = None,
         **kwargs,
@@ -231,7 +231,7 @@ class Region(
         Args:
             mode: Rendering mode - 'show' includes highlights, 'render' is clean
             color: Color for highlighting this region in show mode
-            highlights: Additional highlight groups to show
+            highlights: Additional highlight groups to show, or False to disable all highlights
             crop: Whether to crop to this region
             crop_bbox: Explicit crop bounds (overrides region bounds)
             **kwargs: Additional parameters
@@ -250,10 +250,12 @@ class Region(
             # Crop to this region's bounds
             spec.crop_bbox = self.bbox
 
-        # Add highlights in show mode
-        if mode == "show":
-            # Highlight this region
-            if color or mode == "show":  # Always highlight in show mode
+        # Add highlights in show mode (unless explicitly disabled with highlights=False)
+        if mode == "show" and highlights is not False:
+            # Only highlight this region if:
+            # 1. We're not cropping, OR
+            # 2. We're cropping but color was explicitly specified
+            if not crop or color is not None:
                 spec.add_highlight(
                     bbox=self.bbox,
                     polygon=self.polygon if self.has_polygon else None,
@@ -261,8 +263,8 @@ class Region(
                     label=self.label or self.name or "Region",
                 )
 
-        # Add additional highlight groups if provided
-        if highlights:
+        # Add additional highlight groups if provided (and highlights is a list)
+        if highlights and isinstance(highlights, list):
             for group in highlights:
                 elements = group.get("elements", [])
                 group_color = group.get("color", color)
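
Taken together, the Element and Region hunks above change display behavior: a cropped view no longer gets the automatic red highlight unless a color is passed explicitly, and highlights=False suppresses highlighting entirely. A minimal usage sketch, assuming these keyword arguments are forwarded through the public show() helper; the file name and coordinates are hypothetical:

    from natural_pdf import PDF

    pdf = PDF("example.pdf")                     # hypothetical sample file
    region = pdf.pages[0].region(0, 0, 300, 200)

    # Cropped view: no automatic highlight unless a color is given
    img = region.show(crop=True)
    img = region.show(crop=True, color="blue")   # explicit color still highlights

    # Disable every highlight, including extra highlight groups
    img = region.show(highlights=False)
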
@@ -1234,6 +1236,7 @@ class Region(
         content_filter: Optional[
             Union[str, Callable[[str], bool], List[str]]
         ] = None,  # NEW: Content filtering
+        apply_exclusions: bool = True,  # Whether to apply exclusion regions during extraction
     ) -> TableResult:  # Return type allows Optional[str] for cells
         """
         Extract a table from this region.
@@ -1258,6 +1261,8 @@ class Region(
                 - A callable that takes text and returns True to KEEP the character
                 - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
                 Works with all extraction methods by filtering cell content.
+            apply_exclusions: Whether to apply exclusion regions during text extraction (default: True).
+                When True, text within excluded regions (e.g., headers/footers) will not be extracted.
 
         Returns:
             Table data as a list of rows, where each row is a list of cell values (str or None).
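
The new apply_exclusions flag lets Region.extract_table() honor or ignore the page's exclusion regions. A rough sketch under the assumption that the usual page.region() and page.add_exclusion() helpers are used; the file name and coordinates are illustrative:

    from natural_pdf import PDF

    pdf = PDF("report.pdf")                                 # hypothetical sample file
    page = pdf.pages[0]
    page.add_exclusion(page.region(0, 0, page.width, 50))   # e.g., a running header

    table_region = page.region(0, 50, page.width, page.height)

    # Default: exclusions are honored, so header text stays out of the cells
    rows = table_region.extract_table()

    # Opt out to keep text from excluded regions as well
    raw_rows = table_region.extract_table(apply_exclusions=False)
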
@@ -1295,7 +1300,9 @@ class Region(
             )
             return TableResult(
                 self._extract_table_from_cells(
-                    cell_regions_in_table, content_filter=content_filter
+                    cell_regions_in_table,
+                    content_filter=content_filter,
+                    apply_exclusions=apply_exclusions,
                 )
             )
 
@@ -1379,16 +1386,22 @@ class Region(
         # Use the selected method
         if effective_method == "tatr":
             table_rows = self._extract_table_tatr(
-                use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter
+                use_ocr=use_ocr,
+                ocr_config=ocr_config,
+                content_filter=content_filter,
+                apply_exclusions=apply_exclusions,
             )
         elif effective_method == "text":
             current_text_options = text_options.copy()
             current_text_options["cell_extraction_func"] = cell_extraction_func
             current_text_options["show_progress"] = show_progress
             current_text_options["content_filter"] = content_filter
+            current_text_options["apply_exclusions"] = apply_exclusions
             table_rows = self._extract_table_text(**current_text_options)
         elif effective_method == "pdfplumber":
-            table_rows = self._extract_table_plumber(table_settings, content_filter=content_filter)
+            table_rows = self._extract_table_plumber(
+                table_settings, content_filter=content_filter, apply_exclusions=apply_exclusions
+            )
         else:
             raise ValueError(
                 f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
@@ -1602,7 +1615,9 @@ class Region(
         # Return empty list if no tables found
         return []
 
-    def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
+    def _extract_table_plumber(
+        self, table_settings: dict, content_filter=None, apply_exclusions=True
+    ) -> List[List[str]]:
         """
         Extract table using pdfplumber's table extraction.
         This method extracts the largest table within the region.
@@ -1644,7 +1659,7 @@ class Region(
         # -------------------------------------------------------------
         base_plumber_page = self.page._page
 
-        if getattr(self.page, "_exclusions", None):
+        if apply_exclusions and getattr(self.page, "_exclusions", None):
             exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
 
             def _keep_char(obj):
@@ -1699,7 +1714,7 @@ class Region(
         return []
 
     def _extract_table_tatr(
-        self, use_ocr=False, ocr_config=None, content_filter=None
+        self, use_ocr=False, ocr_config=None, content_filter=None, apply_exclusions=True
    ) -> List[List[str]]:
         """
         Extract table using TATR structure detection.
@@ -1787,7 +1802,7 @@ class Region(
                 continue
 
             # Fallback to normal extraction
-            header_text = header.extract_text().strip()
+            header_text = header.extract_text(apply_exclusions=apply_exclusions).strip()
             if content_filter is not None:
                 header_text = self._apply_content_filter_to_text(header_text, content_filter)
             header_texts.append(header_text)
@@ -1822,7 +1837,7 @@ class Region(
                     continue
 
                 # Fallback to normal extraction
-                cell_text = cell_region.extract_text().strip()
+                cell_text = cell_region.extract_text(apply_exclusions=apply_exclusions).strip()
                 if content_filter is not None:
                     cell_text = self._apply_content_filter_to_text(cell_text, content_filter)
                 row_cells.append(cell_text)
@@ -1838,7 +1853,7 @@ class Region(
                 continue
 
             # Fallback to normal extraction
-            row_text = row.extract_text().strip()
+            row_text = row.extract_text(apply_exclusions=apply_exclusions).strip()
            if content_filter is not None:
                row_text = self._apply_content_filter_to_text(row_text, content_filter)
            row_cells.append(row_text)
@@ -1864,6 +1879,8 @@ class Region(
         show_progress = text_options.pop("show_progress", False)
         # --- Get content_filter option --- #
         content_filter = text_options.pop("content_filter", None)
+        # --- Get apply_exclusions option --- #
+        apply_exclusions = text_options.pop("apply_exclusions", True)
 
         # Analyze structure first (or use cached results)
         if "text_table_structure" in self.analyses:
@@ -1944,7 +1961,9 @@ class Region(
                 cell_value = None
             else:
                 cell_value = cell_region.extract_text(
-                    layout=False, apply_exclusions=False, content_filter=content_filter
+                    layout=False,
+                    apply_exclusions=apply_exclusions,
+                    content_filter=content_filter,
                 ).strip()
 
             rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
@@ -3395,7 +3414,7 @@ class Region(
     # ------------------------------------------------------------------
 
     def _extract_table_from_cells(
-        self, cell_regions: List["Region"], content_filter=None
+        self, cell_regions: List["Region"], content_filter=None, apply_exclusions=True
     ) -> List[List[Optional[str]]]:
         """Construct a table (list-of-lists) from table_cell regions.
 
@@ -3437,7 +3456,9 @@ class Region(
                 r_idx = int(cell.metadata.get("row_index"))
                 c_idx = int(cell.metadata.get("col_index"))
                 text_val = cell.extract_text(
-                    layout=False, apply_exclusions=True, content_filter=content_filter
+                    layout=False,
+                    apply_exclusions=apply_exclusions,
+                    content_filter=content_filter,
                 ).strip()
                 table_grid[r_idx][c_idx] = text_val if text_val else None
             except Exception as _err:
@@ -3486,7 +3507,7 @@ class Region(
             col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
 
             text_val = cell.extract_text(
-                layout=False, apply_exclusions=False, content_filter=content_filter
+                layout=False, apply_exclusions=apply_exclusions, content_filter=content_filter
            ).strip()
            table_grid[row_idx][col_idx] = text_val if text_val else None
 
@@ -41,7 +41,7 @@ class TableResult(Sequence):
 
     def to_df(
         self,
-        header: Union[str, int, List[int], None] = "first",
+        header: Union[str, int, List[int], List[str], None] = "first",
         index_col=None,
         skip_repeating_headers=None,
         keep_blank: bool = False,
@@ -51,8 +51,8 @@ class TableResult(Sequence):
 
         Parameters
        ----------
-        header : "first" | int | list[int] | None, default "first"
-            • "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
+        header : "first" | int | list[int] | list[str] | None, default "first"
+            • "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • list[str] – custom column names.\n • None/False– no header.
 
         Note: If the header row has a different number of columns than the
         body rows, the method will automatically fall back to header=None
@@ -84,7 +84,11 @@ class TableResult(Sequence):
 
         # Determine default for skip_repeating_headers based on header parameter
         if skip_repeating_headers is None:
-            skip_repeating_headers = header is not None and header is not False
+            skip_repeating_headers = (
+                header is not None
+                and header is not False
+                and not (isinstance(header, (list, tuple)) and len(header) == 0)
+            )
 
         # Determine header rows and body rows
         body = rows
@@ -97,10 +101,31 @@ class TableResult(Sequence):
         elif isinstance(header, int):
             hdr = rows[header]
             body = rows[:header] + rows[header + 1 :]
-        elif isinstance(header, (list, tuple)):
+        elif isinstance(header, (list, tuple)) and all(isinstance(i, int) for i in header):
+            # List of integers - multi-row header
             hdr_rows = [rows[i] for i in header]
             body = [r for idx, r in enumerate(rows) if idx not in header]
             hdr = hdr_rows
+        elif (
+            isinstance(header, (list, tuple))
+            and len(header) > 0
+            and all(isinstance(i, str) for i in header)
+        ):
+            # List of strings - custom column names
+            hdr = list(header)
+            body = rows
+            # Validate column count matches
+            if body:
+                max_cols = max(len(row) for row in body)
+                if len(hdr) != max_cols:
+                    raise ValueError(
+                        f"Number of column names ({len(hdr)}) must match "
+                        f"number of columns in data ({max_cols})"
+                    )
+        elif isinstance(header, (list, tuple)) and len(header) == 0:
+            # Empty list behaves like None
+            hdr = None
+            body = rows
         else:
             raise ValueError("Invalid value for header parameter")
 
@@ -125,7 +150,12 @@ class TableResult(Sequence):
             pass
 
         # Check for header/body column count mismatch and fallback to no header
-        if hdr is not None and body:
+        if (
+            hdr is not None
+            and body
+            and not (isinstance(header, (list, tuple)) and all(isinstance(i, str) for i in header))
+        ):
+            # Skip this check for custom string headers
             # Get the maximum number of columns from all body rows
             # This handles cases where some rows have different column counts
             max_cols = max(len(row) for row in body) if body else 0
@@ -144,6 +174,9 @@ class TableResult(Sequence):
                 hdr = None
                 body = self._rows  # Use all rows as body
 
+        # Handle empty list case - pandas needs None not empty list
+        if isinstance(hdr, list) and len(hdr) == 0:
+            hdr = None
         df = pd.DataFrame(body, columns=hdr)
 
         # Convert empty strings to NaN by default
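
With the header handling above, TableResult.to_df() now also accepts a list of strings as explicit column names, validates the count against the widest body row, and treats an empty list like header=None. A small sketch against a hypothetical three-column table; the column names are illustrative:

    result = region.extract_table()

    # Previous behavior, still the default: row 0 becomes the header
    df = result.to_df()

    # New in 0.2.5: supply your own column names instead of consuming a header row
    df = result.to_df(header=["name", "amount", "date"])

    # A mismatched count raises ValueError
    # result.to_df(header=["only", "two"])  # -> ValueError

    # An empty list now behaves like header=None
    df = result.to_df(header=[])
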
@@ -0,0 +1,7 @@
+"""Vision module for visual similarity and pattern matching"""
+
+from .mixin import VisualSearchMixin
+from .results import Match, MatchResults
+from .similarity import VisualMatcher, compute_phash
+
+__all__ = ["VisualMatcher", "compute_phash", "Match", "MatchResults", "VisualSearchMixin"]
@@ -0,0 +1,209 @@
+"""Mixin to add visual similarity search to Page/PDF/PDFCollection"""
+
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+from PIL import Image
+from tqdm.auto import tqdm
+
+from .results import Match, MatchResults
+from .similarity import VisualMatcher, compute_phash
+
+
+class VisualSearchMixin:
+    """Add find_similar method to classes that include this mixin"""
+
+    def find_similar(
+        self,
+        examples: Union["Element", "Region", List[Union["Element", "Region"]]],
+        using: str = "vision",
+        confidence: float = 0.6,
+        sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
+        resolution: int = 72,
+        hash_size: int = 20,
+        step_factor: float = 0.1,
+        max_per_page: Optional[int] = None,
+        show_progress: bool = True,
+        **kwargs,
+    ) -> MatchResults:
+        """
+        Find regions visually similar to the given example(s).
+
+        Args:
+            examples: Single element/region or list of examples to search for
+            using: Search method - currently only 'vision' is supported
+            confidence: Minimum similarity score (0-1)
+            sizes: Size variations to search. Can be:
+                - float: ±percentage (e.g., 0.2 = 80%-120%)
+                - tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.0))
+                - tuple(min, max, step): explicit step size
+                - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
+            resolution: Resolution for image comparison (DPI) (default: 72)
+            hash_size: Size of perceptual hash grid (default: 12)
+            step_factor: Step size as fraction of template size (default: 0.1)
+            max_per_page: Maximum matches to return per page
+            show_progress: Show progress bar for multi-page searches (default: True)
+            **kwargs: Additional options
+
+        Returns:
+            MatchResults collection
+        """
+        if using != "vision":
+            raise NotImplementedError(f"using='{using}' not yet supported")
+
+        # Ensure examples is a list
+        if not isinstance(examples, list):
+            examples = [examples]
+
+        # Initialize matcher with specified hash size
+        matcher = VisualMatcher(hash_size=hash_size)
+
+        # Prepare templates
+        templates = []
+        for example in examples:
+            # Render the example region/element
+            example_image = example.render(resolution=resolution, crop=True)
+            template_hash = compute_phash(example_image, hash_size=hash_size)
+            templates.append({"image": example_image, "hash": template_hash, "source": example})
+
+        # Get pages to search based on the object type
+        if hasattr(self, "__class__") and self.__class__.__name__ == "PDFCollection":
+            # PDFCollection needs to iterate through all PDFs
+            pages_to_search = []
+            for pdf in self:
+                pages_to_search.extend(pdf.pages)
+        elif hasattr(self, "pages"):  # PDF
+            pages_to_search = self.pages
+        elif hasattr(self, "number"):  # Single page
+            pages_to_search = [self]
+        else:
+            raise TypeError(f"Cannot search in {type(self)}")
+
+        # Calculate total operations for progress bar
+        total_operations = 0
+        if show_progress:
+            # Get scales that will be searched
+            scales = matcher._get_search_scales(sizes)
+
+            # Pre-calculate for all pages and templates
+            for page in pages_to_search:
+                # Estimate page image size
+                page_w = int(page.width * resolution / 72.0)
+                page_h = int(page.height * resolution / 72.0)
+
+                for template_data in templates:
+                    template_w, template_h = template_data["image"].size
+
+                    for scale in scales:
+                        scaled_w = int(template_w * scale)
+                        scaled_h = int(template_h * scale)
+
+                        if scaled_w <= page_w and scaled_h <= page_h:
+                            step_x = max(1, int(scaled_w * step_factor))
+                            step_y = max(1, int(scaled_h * step_factor))
+
+                            x_windows = len(range(0, page_w - scaled_w + 1, step_x))
+                            y_windows = len(range(0, page_h - scaled_h + 1, step_y))
+                            total_operations += x_windows * y_windows
+
+        # Search each page
+        all_matches = []
+
+        # Create single progress bar for all operations
+        progress_bar = None
+        operations_done = 0
+        last_update = 0
+        update_frequency = max(1, total_operations // 1000)  # Update at most 1000 times
+
+        if show_progress and total_operations > 0:
+            progress_bar = tqdm(
+                total=total_operations,
+                desc="Searching",
+                unit="window",
+                miniters=update_frequency,  # Minimum iterations between updates
+                mininterval=0.1,  # Minimum time between updates (seconds)
+            )
+
+        for page_idx, page in enumerate(pages_to_search):
+            # Render the full page once
+            page_image = page.render(resolution=resolution)
+
+            # Convert page coordinates to image coordinates
+            scale = resolution / 72.0  # PDF is 72 DPI
+
+            page_matches = []
+
+            # Search for each template
+            for template_idx, template_data in enumerate(templates):
+                template_image = template_data["image"]
+                template_hash = template_data["hash"]
+
+                # Custom progress callback to update our main progress bar
+                def update_progress():
+                    nonlocal operations_done, last_update
+                    operations_done += 1
+
+                    # Only update progress bar every N operations to avoid overwhelming output
+                    if progress_bar and (
+                        operations_done - last_update >= update_frequency
+                        or operations_done == total_operations
+                    ):
+                        progress_bar.update(operations_done - last_update)
+                        last_update = operations_done
+
+                        # Update description with current page/template info
+                        if len(pages_to_search) > 1:
+                            progress_bar.set_description(
+                                f"Page {page.number}/{len(pages_to_search)}"
+                            )
+                        elif len(templates) > 1:
+                            progress_bar.set_description(
+                                f"Template {template_idx + 1}/{len(templates)}"
+                            )
+
+                # Find matches in this page - never show internal progress
+                candidates = matcher.find_matches_in_image(
+                    template_image,
+                    page_image,
+                    template_hash=template_hash,
+                    confidence_threshold=confidence,
+                    sizes=sizes,
+                    step_factor=step_factor,
+                    show_progress=False,  # We handle progress ourselves
+                    progress_callback=update_progress if progress_bar else None,
+                    **kwargs,
+                )
+
+                # Convert image coordinates back to PDF coordinates
+                for candidate in candidates:
+                    img_x0, img_y0, img_x1, img_y1 = candidate.bbox
+
+                    # Convert from image pixels to PDF points
+                    # No flipping needed! PDF coordinates map directly to PIL coordinates
+                    pdf_x0 = img_x0 / scale
+                    pdf_y0 = img_y0 / scale
+                    pdf_x1 = img_x1 / scale
+                    pdf_y1 = img_y1 / scale
+
+                    # Create Match object
+                    match = Match(
+                        page=page,
+                        bbox=(pdf_x0, pdf_y0, pdf_x1, pdf_y1),
+                        confidence=candidate.confidence,
+                        source_example=template_data["source"],
+                    )
+                    page_matches.append(match)
+
+            # Apply max_per_page limit if specified
+            if max_per_page and len(page_matches) > max_per_page:
+                # Sort by confidence and take top N
+                page_matches.sort(key=lambda m: m.confidence, reverse=True)
+                page_matches = page_matches[:max_per_page]
+
+            all_matches.extend(page_matches)
+
+        # Close progress bar
+        if progress_bar:
+            progress_bar.close()
+
+        return MatchResults(all_matches)
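
The new VisualSearchMixin adds find_similar() to Page, PDF and PDFCollection, per the module docstring. A usage sketch built from the signature above, with a hypothetical filings.pdf and illustrative coordinates, assuming MatchResults iterates over the Match objects constructed in the mixin:

    from natural_pdf import PDF

    pdf = PDF("filings.pdf")                 # hypothetical sample file
    page = pdf.pages[0]

    # Use an existing element or region as the visual template
    logo = page.region(36, 36, 136, 86)      # illustrative coordinates

    matches = pdf.find_similar(
        logo,
        confidence=0.7,       # minimum similarity score (0-1)
        sizes=(0.8, 1.2),     # also try scaled variants of the template
        resolution=72,        # DPI used when rendering pages for comparison
        max_per_page=5,       # keep only the top 5 matches per page
    )

    for match in matches:
        print(match.page.number, match.bbox, round(match.confidence, 2))
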