natural-pdf 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +185 -9
- natural_pdf/core/element_manager.py +5 -0
- natural_pdf/core/page.py +42 -4
- natural_pdf/core/pdf.py +45 -3
- natural_pdf/core/pdf_collection.py +131 -4
- natural_pdf/core/render_spec.py +2 -2
- natural_pdf/elements/base.py +18 -14
- natural_pdf/elements/region.py +42 -21
- natural_pdf/tables/result.py +39 -6
- natural_pdf/vision/__init__.py +7 -0
- natural_pdf/vision/mixin.py +209 -0
- natural_pdf/vision/results.py +146 -0
- natural_pdf/vision/similarity.py +321 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/RECORD +19 -15
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.3.dist-info → natural_pdf-0.2.5.dist-info}/top_level.txt +0 -0
natural_pdf/elements/base.py
CHANGED
@@ -1192,7 +1192,7 @@ class Element(
|
|
1192
1192
|
self,
|
1193
1193
|
mode: Literal["show", "render"] = "show",
|
1194
1194
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
1195
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
1195
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
1196
1196
|
crop: Union[bool, Literal["content"]] = False,
|
1197
1197
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
1198
1198
|
label: Optional[str] = None,
|
@@ -1203,7 +1203,7 @@ class Element(
|
|
1203
1203
|
Args:
|
1204
1204
|
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
1205
1205
|
color: Color for highlighting this element in show mode
|
1206
|
-
highlights: Additional highlight groups to show
|
1206
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
1207
1207
|
crop: Whether to crop to element bounds
|
1208
1208
|
crop_bbox: Explicit crop bounds
|
1209
1209
|
label: Optional label for this element
|
@@ -1225,19 +1225,23 @@ class Element(
|
|
1225
1225
|
if hasattr(self, "bbox") and self.bbox:
|
1226
1226
|
spec.crop_bbox = self.bbox
|
1227
1227
|
|
1228
|
-
# Add highlight in show mode
|
1229
|
-
if mode == "show":
|
1230
|
-
#
|
1231
|
-
|
1232
|
-
|
1233
|
-
|
1234
|
-
|
1235
|
-
|
1236
|
-
|
1237
|
-
|
1228
|
+
# Add highlight in show mode (unless explicitly disabled with highlights=False)
|
1229
|
+
if mode == "show" and highlights is not False:
|
1230
|
+
# Only highlight this element if:
|
1231
|
+
# 1. We're not cropping, OR
|
1232
|
+
# 2. We're cropping but color was explicitly specified
|
1233
|
+
if not crop or color is not None:
|
1234
|
+
# Use provided label or generate one
|
1235
|
+
element_label = label if label is not None else self.__class__.__name__
|
1236
|
+
|
1237
|
+
spec.add_highlight(
|
1238
|
+
element=self,
|
1239
|
+
color=color or "red", # Default red for single element
|
1240
|
+
label=element_label,
|
1241
|
+
)
|
1238
1242
|
|
1239
|
-
# Add additional highlight groups if provided
|
1240
|
-
if highlights:
|
1243
|
+
# Add additional highlight groups if provided (and highlights is a list)
|
1244
|
+
if highlights and isinstance(highlights, list):
|
1241
1245
|
for group in highlights:
|
1242
1246
|
group_elements = group.get("elements", [])
|
1243
1247
|
group_color = group.get("color", color)
|
natural_pdf/elements/region.py
CHANGED
@@ -221,7 +221,7 @@ class Region(
|
|
221
221
|
self,
|
222
222
|
mode: Literal["show", "render"] = "show",
|
223
223
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
224
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
224
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
225
225
|
crop: Union[bool, Literal["content"]] = True, # Default to True for regions
|
226
226
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
227
227
|
**kwargs,
|
@@ -231,7 +231,7 @@ class Region(
|
|
231
231
|
Args:
|
232
232
|
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
233
233
|
color: Color for highlighting this region in show mode
|
234
|
-
highlights: Additional highlight groups to show
|
234
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
235
235
|
crop: Whether to crop to this region
|
236
236
|
crop_bbox: Explicit crop bounds (overrides region bounds)
|
237
237
|
**kwargs: Additional parameters
|
@@ -250,10 +250,12 @@ class Region(
|
|
250
250
|
# Crop to this region's bounds
|
251
251
|
spec.crop_bbox = self.bbox
|
252
252
|
|
253
|
-
# Add highlights in show mode
|
254
|
-
if mode == "show":
|
255
|
-
#
|
256
|
-
|
253
|
+
# Add highlights in show mode (unless explicitly disabled with highlights=False)
|
254
|
+
if mode == "show" and highlights is not False:
|
255
|
+
# Only highlight this region if:
|
256
|
+
# 1. We're not cropping, OR
|
257
|
+
# 2. We're cropping but color was explicitly specified
|
258
|
+
if not crop or color is not None:
|
257
259
|
spec.add_highlight(
|
258
260
|
bbox=self.bbox,
|
259
261
|
polygon=self.polygon if self.has_polygon else None,
|
@@ -261,8 +263,8 @@ class Region(
|
|
261
263
|
label=self.label or self.name or "Region",
|
262
264
|
)
|
263
265
|
|
264
|
-
# Add additional highlight groups if provided
|
265
|
-
if highlights:
|
266
|
+
# Add additional highlight groups if provided (and highlights is a list)
|
267
|
+
if highlights and isinstance(highlights, list):
|
266
268
|
for group in highlights:
|
267
269
|
elements = group.get("elements", [])
|
268
270
|
group_color = group.get("color", color)
|
@@ -1234,6 +1236,7 @@ class Region(
|
|
1234
1236
|
content_filter: Optional[
|
1235
1237
|
Union[str, Callable[[str], bool], List[str]]
|
1236
1238
|
] = None, # NEW: Content filtering
|
1239
|
+
apply_exclusions: bool = True, # Whether to apply exclusion regions during extraction
|
1237
1240
|
) -> TableResult: # Return type allows Optional[str] for cells
|
1238
1241
|
"""
|
1239
1242
|
Extract a table from this region.
|
@@ -1258,6 +1261,8 @@ class Region(
|
|
1258
1261
|
- A callable that takes text and returns True to KEEP the character
|
1259
1262
|
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
1260
1263
|
Works with all extraction methods by filtering cell content.
|
1264
|
+
apply_exclusions: Whether to apply exclusion regions during text extraction (default: True).
|
1265
|
+
When True, text within excluded regions (e.g., headers/footers) will not be extracted.
|
1261
1266
|
|
1262
1267
|
Returns:
|
1263
1268
|
Table data as a list of rows, where each row is a list of cell values (str or None).
|
@@ -1295,7 +1300,9 @@ class Region(
|
|
1295
1300
|
)
|
1296
1301
|
return TableResult(
|
1297
1302
|
self._extract_table_from_cells(
|
1298
|
-
cell_regions_in_table,
|
1303
|
+
cell_regions_in_table,
|
1304
|
+
content_filter=content_filter,
|
1305
|
+
apply_exclusions=apply_exclusions,
|
1299
1306
|
)
|
1300
1307
|
)
|
1301
1308
|
|
@@ -1379,16 +1386,22 @@ class Region(
|
|
1379
1386
|
# Use the selected method
|
1380
1387
|
if effective_method == "tatr":
|
1381
1388
|
table_rows = self._extract_table_tatr(
|
1382
|
-
use_ocr=use_ocr,
|
1389
|
+
use_ocr=use_ocr,
|
1390
|
+
ocr_config=ocr_config,
|
1391
|
+
content_filter=content_filter,
|
1392
|
+
apply_exclusions=apply_exclusions,
|
1383
1393
|
)
|
1384
1394
|
elif effective_method == "text":
|
1385
1395
|
current_text_options = text_options.copy()
|
1386
1396
|
current_text_options["cell_extraction_func"] = cell_extraction_func
|
1387
1397
|
current_text_options["show_progress"] = show_progress
|
1388
1398
|
current_text_options["content_filter"] = content_filter
|
1399
|
+
current_text_options["apply_exclusions"] = apply_exclusions
|
1389
1400
|
table_rows = self._extract_table_text(**current_text_options)
|
1390
1401
|
elif effective_method == "pdfplumber":
|
1391
|
-
table_rows = self._extract_table_plumber(
|
1402
|
+
table_rows = self._extract_table_plumber(
|
1403
|
+
table_settings, content_filter=content_filter, apply_exclusions=apply_exclusions
|
1404
|
+
)
|
1392
1405
|
else:
|
1393
1406
|
raise ValueError(
|
1394
1407
|
f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
|
@@ -1602,7 +1615,9 @@ class Region(
|
|
1602
1615
|
# Return empty list if no tables found
|
1603
1616
|
return []
|
1604
1617
|
|
1605
|
-
def _extract_table_plumber(
|
1618
|
+
def _extract_table_plumber(
|
1619
|
+
self, table_settings: dict, content_filter=None, apply_exclusions=True
|
1620
|
+
) -> List[List[str]]:
|
1606
1621
|
"""
|
1607
1622
|
Extract table using pdfplumber's table extraction.
|
1608
1623
|
This method extracts the largest table within the region.
|
@@ -1644,7 +1659,7 @@ class Region(
|
|
1644
1659
|
# -------------------------------------------------------------
|
1645
1660
|
base_plumber_page = self.page._page
|
1646
1661
|
|
1647
|
-
if getattr(self.page, "_exclusions", None):
|
1662
|
+
if apply_exclusions and getattr(self.page, "_exclusions", None):
|
1648
1663
|
exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
|
1649
1664
|
|
1650
1665
|
def _keep_char(obj):
|
@@ -1699,7 +1714,7 @@ class Region(
|
|
1699
1714
|
return []
|
1700
1715
|
|
1701
1716
|
def _extract_table_tatr(
|
1702
|
-
self, use_ocr=False, ocr_config=None, content_filter=None
|
1717
|
+
self, use_ocr=False, ocr_config=None, content_filter=None, apply_exclusions=True
|
1703
1718
|
) -> List[List[str]]:
|
1704
1719
|
"""
|
1705
1720
|
Extract table using TATR structure detection.
|
@@ -1787,7 +1802,7 @@ class Region(
|
|
1787
1802
|
continue
|
1788
1803
|
|
1789
1804
|
# Fallback to normal extraction
|
1790
|
-
header_text = header.extract_text().strip()
|
1805
|
+
header_text = header.extract_text(apply_exclusions=apply_exclusions).strip()
|
1791
1806
|
if content_filter is not None:
|
1792
1807
|
header_text = self._apply_content_filter_to_text(header_text, content_filter)
|
1793
1808
|
header_texts.append(header_text)
|
@@ -1822,7 +1837,7 @@ class Region(
|
|
1822
1837
|
continue
|
1823
1838
|
|
1824
1839
|
# Fallback to normal extraction
|
1825
|
-
cell_text = cell_region.extract_text().strip()
|
1840
|
+
cell_text = cell_region.extract_text(apply_exclusions=apply_exclusions).strip()
|
1826
1841
|
if content_filter is not None:
|
1827
1842
|
cell_text = self._apply_content_filter_to_text(cell_text, content_filter)
|
1828
1843
|
row_cells.append(cell_text)
|
@@ -1838,7 +1853,7 @@ class Region(
|
|
1838
1853
|
continue
|
1839
1854
|
|
1840
1855
|
# Fallback to normal extraction
|
1841
|
-
row_text = row.extract_text().strip()
|
1856
|
+
row_text = row.extract_text(apply_exclusions=apply_exclusions).strip()
|
1842
1857
|
if content_filter is not None:
|
1843
1858
|
row_text = self._apply_content_filter_to_text(row_text, content_filter)
|
1844
1859
|
row_cells.append(row_text)
|
@@ -1864,6 +1879,8 @@ class Region(
|
|
1864
1879
|
show_progress = text_options.pop("show_progress", False)
|
1865
1880
|
# --- Get content_filter option --- #
|
1866
1881
|
content_filter = text_options.pop("content_filter", None)
|
1882
|
+
# --- Get apply_exclusions option --- #
|
1883
|
+
apply_exclusions = text_options.pop("apply_exclusions", True)
|
1867
1884
|
|
1868
1885
|
# Analyze structure first (or use cached results)
|
1869
1886
|
if "text_table_structure" in self.analyses:
|
@@ -1944,7 +1961,9 @@ class Region(
|
|
1944
1961
|
cell_value = None
|
1945
1962
|
else:
|
1946
1963
|
cell_value = cell_region.extract_text(
|
1947
|
-
layout=False,
|
1964
|
+
layout=False,
|
1965
|
+
apply_exclusions=apply_exclusions,
|
1966
|
+
content_filter=content_filter,
|
1948
1967
|
).strip()
|
1949
1968
|
|
1950
1969
|
rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
|
@@ -3395,7 +3414,7 @@ class Region(
|
|
3395
3414
|
# ------------------------------------------------------------------
|
3396
3415
|
|
3397
3416
|
def _extract_table_from_cells(
|
3398
|
-
self, cell_regions: List["Region"], content_filter=None
|
3417
|
+
self, cell_regions: List["Region"], content_filter=None, apply_exclusions=True
|
3399
3418
|
) -> List[List[Optional[str]]]:
|
3400
3419
|
"""Construct a table (list-of-lists) from table_cell regions.
|
3401
3420
|
|
@@ -3437,7 +3456,9 @@ class Region(
|
|
3437
3456
|
r_idx = int(cell.metadata.get("row_index"))
|
3438
3457
|
c_idx = int(cell.metadata.get("col_index"))
|
3439
3458
|
text_val = cell.extract_text(
|
3440
|
-
layout=False,
|
3459
|
+
layout=False,
|
3460
|
+
apply_exclusions=apply_exclusions,
|
3461
|
+
content_filter=content_filter,
|
3441
3462
|
).strip()
|
3442
3463
|
table_grid[r_idx][c_idx] = text_val if text_val else None
|
3443
3464
|
except Exception as _err:
|
@@ -3486,7 +3507,7 @@ class Region(
|
|
3486
3507
|
col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
|
3487
3508
|
|
3488
3509
|
text_val = cell.extract_text(
|
3489
|
-
layout=False, apply_exclusions=
|
3510
|
+
layout=False, apply_exclusions=apply_exclusions, content_filter=content_filter
|
3490
3511
|
).strip()
|
3491
3512
|
table_grid[row_idx][col_idx] = text_val if text_val else None
|
3492
3513
|
|
natural_pdf/tables/result.py
CHANGED
@@ -41,7 +41,7 @@ class TableResult(Sequence):
|
|
41
41
|
|
42
42
|
def to_df(
|
43
43
|
self,
|
44
|
-
header: Union[str, int, List[int], None] = "first",
|
44
|
+
header: Union[str, int, List[int], List[str], None] = "first",
|
45
45
|
index_col=None,
|
46
46
|
skip_repeating_headers=None,
|
47
47
|
keep_blank: bool = False,
|
@@ -51,8 +51,8 @@ class TableResult(Sequence):
|
|
51
51
|
|
52
52
|
Parameters
|
53
53
|
----------
|
54
|
-
header : "first" | int | list[int] | None, default "first"
|
55
|
-
• "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
|
54
|
+
header : "first" | int | list[int] | list[str] | None, default "first"
|
55
|
+
• "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • list[str] – custom column names.\n • None/False– no header.
|
56
56
|
|
57
57
|
Note: If the header row has a different number of columns than the
|
58
58
|
body rows, the method will automatically fall back to header=None
|
@@ -84,7 +84,11 @@ class TableResult(Sequence):
|
|
84
84
|
|
85
85
|
# Determine default for skip_repeating_headers based on header parameter
|
86
86
|
if skip_repeating_headers is None:
|
87
|
-
skip_repeating_headers =
|
87
|
+
skip_repeating_headers = (
|
88
|
+
header is not None
|
89
|
+
and header is not False
|
90
|
+
and not (isinstance(header, (list, tuple)) and len(header) == 0)
|
91
|
+
)
|
88
92
|
|
89
93
|
# Determine header rows and body rows
|
90
94
|
body = rows
|
@@ -97,10 +101,31 @@ class TableResult(Sequence):
|
|
97
101
|
elif isinstance(header, int):
|
98
102
|
hdr = rows[header]
|
99
103
|
body = rows[:header] + rows[header + 1 :]
|
100
|
-
elif isinstance(header, (list, tuple)):
|
104
|
+
elif isinstance(header, (list, tuple)) and all(isinstance(i, int) for i in header):
|
105
|
+
# List of integers - multi-row header
|
101
106
|
hdr_rows = [rows[i] for i in header]
|
102
107
|
body = [r for idx, r in enumerate(rows) if idx not in header]
|
103
108
|
hdr = hdr_rows
|
109
|
+
elif (
|
110
|
+
isinstance(header, (list, tuple))
|
111
|
+
and len(header) > 0
|
112
|
+
and all(isinstance(i, str) for i in header)
|
113
|
+
):
|
114
|
+
# List of strings - custom column names
|
115
|
+
hdr = list(header)
|
116
|
+
body = rows
|
117
|
+
# Validate column count matches
|
118
|
+
if body:
|
119
|
+
max_cols = max(len(row) for row in body)
|
120
|
+
if len(hdr) != max_cols:
|
121
|
+
raise ValueError(
|
122
|
+
f"Number of column names ({len(hdr)}) must match "
|
123
|
+
f"number of columns in data ({max_cols})"
|
124
|
+
)
|
125
|
+
elif isinstance(header, (list, tuple)) and len(header) == 0:
|
126
|
+
# Empty list behaves like None
|
127
|
+
hdr = None
|
128
|
+
body = rows
|
104
129
|
else:
|
105
130
|
raise ValueError("Invalid value for header parameter")
|
106
131
|
|
@@ -125,7 +150,12 @@ class TableResult(Sequence):
|
|
125
150
|
pass
|
126
151
|
|
127
152
|
# Check for header/body column count mismatch and fallback to no header
|
128
|
-
if
|
153
|
+
if (
|
154
|
+
hdr is not None
|
155
|
+
and body
|
156
|
+
and not (isinstance(header, (list, tuple)) and all(isinstance(i, str) for i in header))
|
157
|
+
):
|
158
|
+
# Skip this check for custom string headers
|
129
159
|
# Get the maximum number of columns from all body rows
|
130
160
|
# This handles cases where some rows have different column counts
|
131
161
|
max_cols = max(len(row) for row in body) if body else 0
|
@@ -144,6 +174,9 @@ class TableResult(Sequence):
|
|
144
174
|
hdr = None
|
145
175
|
body = self._rows # Use all rows as body
|
146
176
|
|
177
|
+
# Handle empty list case - pandas needs None not empty list
|
178
|
+
if isinstance(hdr, list) and len(hdr) == 0:
|
179
|
+
hdr = None
|
147
180
|
df = pd.DataFrame(body, columns=hdr)
|
148
181
|
|
149
182
|
# Convert empty strings to NaN by default
|
@@ -0,0 +1,7 @@
|
|
1
|
+
"""Vision module for visual similarity and pattern matching"""
|
2
|
+
|
3
|
+
from .mixin import VisualSearchMixin
|
4
|
+
from .results import Match, MatchResults
|
5
|
+
from .similarity import VisualMatcher, compute_phash
|
6
|
+
|
7
|
+
__all__ = ["VisualMatcher", "compute_phash", "Match", "MatchResults", "VisualSearchMixin"]
|
@@ -0,0 +1,209 @@
|
|
1
|
+
"""Mixin to add visual similarity search to Page/PDF/PDFCollection"""
|
2
|
+
|
3
|
+
from typing import List, Optional, Tuple, Union
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
from PIL import Image
|
7
|
+
from tqdm.auto import tqdm
|
8
|
+
|
9
|
+
from .results import Match, MatchResults
|
10
|
+
from .similarity import VisualMatcher, compute_phash
|
11
|
+
|
12
|
+
|
13
|
+
class VisualSearchMixin:
    """Add a ``find_similar`` method to classes that include this mixin.

    The host object must be one of:

    * a class named ``PDFCollection`` that iterates over PDF-like objects,
      each exposing ``pages``;
    * an object exposing ``pages`` (a PDF);
    * an object exposing ``number`` (a single page).

    Every page searched must provide ``width``, ``height`` and
    ``render(resolution=...)``.
    """

    def find_similar(
        self,
        examples: Union["Element", "Region", List[Union["Element", "Region"]]],
        using: str = "vision",
        confidence: float = 0.6,
        sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
        resolution: int = 72,
        hash_size: int = 20,
        step_factor: float = 0.1,
        max_per_page: Optional[int] = None,
        show_progress: bool = True,
        **kwargs,
    ) -> "MatchResults":
        """
        Find regions visually similar to the given example(s).

        Args:
            examples: Single element/region or list of examples to search for
            using: Search method - currently only 'vision' is supported
            confidence: Minimum similarity score (0-1)
            sizes: Size variations to search (default: (0.8, 1.2)). The value is
                passed through to the matcher, which decides the scales. Can be:
                - float: ±percentage (e.g., 0.2 = 80%-120%)
                - tuple(min, max): search range, intermediate scales chosen by the matcher
                - tuple(min, max, step): explicit step size
                - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
            resolution: Resolution for image comparison (DPI) (default: 72)
            hash_size: Size of perceptual hash grid (default: 20)
            step_factor: Step size as fraction of template size (default: 0.1)
            max_per_page: Maximum matches to keep per page, highest confidence
                first. ``None`` (or 0) keeps every match.
            show_progress: Show a single progress bar covering the whole search (default: True)
            **kwargs: Additional options forwarded to the matcher's
                ``find_matches_in_image``

        Returns:
            MatchResults collection with one Match per accepted candidate,
            expressed in PDF coordinates.

        Raises:
            NotImplementedError: If ``using`` is anything other than 'vision'.
            TypeError: If the host object is not a page, PDF or PDFCollection.
        """
        if using != "vision":
            raise NotImplementedError(f"using='{using}' not yet supported")

        # Ensure examples is a list
        if not isinstance(examples, list):
            examples = [examples]

        # Resolve the search scope before doing any rendering so that an
        # unsupported host fails fast instead of after the expensive work.
        pages_to_search = self._visual_search_pages()
        page_count = len(pages_to_search)

        # Initialize matcher with specified hash size
        matcher = VisualMatcher(hash_size=hash_size)

        # Render every example once and hash it; the hash is reused on every page
        templates = []
        for example in examples:
            example_image = example.render(resolution=resolution, crop=True)
            templates.append(
                {
                    "image": example_image,
                    "hash": compute_phash(example_image, hash_size=hash_size),
                    "source": example,
                }
            )

        # The window count is only needed to size the progress bar
        total_operations = 0
        if show_progress:
            total_operations = self._visual_search_window_count(
                pages_to_search,
                templates,
                matcher._get_search_scales(sizes),
                resolution,
                step_factor,
            )

        update_frequency = max(1, total_operations // 1000)  # Update at most 1000 times

        # Create single progress bar for all operations
        progress_bar = None
        if show_progress and total_operations > 0:
            progress_bar = tqdm(
                total=total_operations,
                desc="Searching",
                unit="window",
                miniters=update_frequency,  # Minimum iterations between updates
                mininterval=0.1,  # Minimum time between updates (seconds)
            )

        operations_done = 0
        last_update = 0
        description = None  # Refreshed per page/template by the search loop below

        def update_progress():
            """Count one evaluated window and refresh the bar in throttled batches."""
            nonlocal operations_done, last_update
            operations_done += 1

            # Only touch the bar every N operations to avoid overwhelming output
            pending = operations_done - last_update
            if pending >= update_frequency or operations_done == total_operations:
                progress_bar.update(pending)
                last_update = operations_done
                if description is not None:
                    progress_bar.set_description(description)

        # The callback is only handed to the matcher when a bar actually exists
        progress_callback = update_progress if progress_bar is not None else None

        # Convert page coordinates to image coordinates
        scale = resolution / 72.0  # PDF is 72 DPI

        all_matches = []
        try:
            for page_idx, page in enumerate(pages_to_search):
                # Render the full page once and reuse it for every template
                page_image = page.render(resolution=resolution)
                page_matches = []

                for template_idx, template_data in enumerate(templates):
                    # Describe progress by position in the search, not by
                    # page.number, so the label stays within "1..N" even when
                    # pages come from several PDFs or a subset of a document.
                    if page_count > 1:
                        description = f"Page {page_idx + 1}/{page_count}"
                    elif len(templates) > 1:
                        description = f"Template {template_idx + 1}/{len(templates)}"

                    # Find matches in this page - never show internal progress
                    candidates = matcher.find_matches_in_image(
                        template_data["image"],
                        page_image,
                        template_hash=template_data["hash"],
                        confidence_threshold=confidence,
                        sizes=sizes,
                        step_factor=step_factor,
                        show_progress=False,  # We handle progress ourselves
                        progress_callback=progress_callback,
                        **kwargs,
                    )

                    # Convert image pixels back to PDF points.
                    # No flipping needed: PDF coordinates map directly to PIL coordinates.
                    for candidate in candidates:
                        img_x0, img_y0, img_x1, img_y1 = candidate.bbox
                        page_matches.append(
                            Match(
                                page=page,
                                bbox=(
                                    img_x0 / scale,
                                    img_y0 / scale,
                                    img_x1 / scale,
                                    img_y1 / scale,
                                ),
                                confidence=candidate.confidence,
                                source_example=template_data["source"],
                            )
                        )

                # Apply max_per_page limit if specified: keep the top N by confidence
                if max_per_page and len(page_matches) > max_per_page:
                    page_matches = sorted(
                        page_matches, key=lambda m: m.confidence, reverse=True
                    )[:max_per_page]

                all_matches.extend(page_matches)

            # Flush whatever was counted since the last throttled update
            if progress_bar is not None and operations_done > last_update:
                progress_bar.update(operations_done - last_update)
        finally:
            # Always release the bar, even when rendering or matching raises
            if progress_bar is not None:
                progress_bar.close()

        return MatchResults(all_matches)

    def _visual_search_pages(self):
        """Return the pages this object should search, based on the object type."""
        if type(self).__name__ == "PDFCollection":
            # PDFCollection needs to iterate through all PDFs
            collected = []
            for pdf in self:
                collected.extend(pdf.pages)
            return collected
        if hasattr(self, "pages"):  # PDF
            return self.pages
        if hasattr(self, "number"):  # Single page
            return [self]
        raise TypeError(f"Cannot search in {type(self)}")

    @staticmethod
    def _visual_search_window_count(pages, templates, scales, resolution, step_factor) -> int:
        """Estimate how many sliding windows the search will evaluate in total."""
        total = 0
        for page in pages:
            # Estimate page image size from the page dimensions in points
            page_w = int(page.width * resolution / 72.0)
            page_h = int(page.height * resolution / 72.0)

            for template_data in templates:
                template_w, template_h = template_data["image"].size

                for scale in scales:
                    scaled_w = int(template_w * scale)
                    scaled_h = int(template_h * scale)

                    # A template larger than the page contributes no windows
                    if scaled_w > page_w or scaled_h > page_h:
                        continue

                    step_x = max(1, int(scaled_w * step_factor))
                    step_y = max(1, int(scaled_h * step_factor))

                    x_windows = len(range(0, page_w - scaled_w + 1, step_x))
                    y_windows = len(range(0, page_h - scaled_h + 1, step_y))
                    total += x_windows * y_windows
        return total
|