natural-pdf 0.2.12__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/core/highlighting_service.py +40 -10
- natural_pdf/elements/base.py +18 -1
- natural_pdf/elements/element_collection.py +153 -15
- natural_pdf/elements/rect.py +34 -0
- natural_pdf/elements/region.py +55 -3
- natural_pdf/elements/text.py +20 -2
- natural_pdf/selectors/parser.py +28 -1
- natural_pdf/vision/__init__.py +1 -2
- natural_pdf/vision/mixin.py +67 -27
- natural_pdf/vision/results.py +49 -5
- natural_pdf/vision/similarity.py +195 -23
- natural_pdf/vision/template_matching.py +209 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/RECORD +24 -23
- temp/test_draw_guides.py +25 -0
- temp/test_draw_guides_interactive.py +30 -0
- temp/test_guide_draw_notebook.py +47 -0
- temp/test_inline_js.py +22 -0
- temp/test_widget_functionality.py +68 -0
- temp/test_widget_simple.py +41 -0
- temp/debug_cell_extraction.py +0 -42
- temp/debug_exclusion_overlap.py +0 -43
- temp/debug_exclusions_guides.py +0 -67
- temp/debug_extra_guide.py +0 -41
- temp/debug_outer_boundaries.py +0 -46
- temp/debug_st_search.py +0 -33
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.15.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,209 @@
|
|
1
|
+
"""Pure NumPy template matching implementation"""
|
2
|
+
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from typing import List, Optional, Tuple
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
|
8
|
+
|
9
|
+
@dataclass
class TemplateMatch:
    """One hit produced by template matching: a location plus its score."""

    # Corner coordinates of the matched area, ordered (x0, y0, x1, y1).
    bbox: Tuple[int, int, int, int]
    # Match quality on a 0-1 scale; larger values indicate a better match.
    score: float
|
15
|
+
|
16
|
+
|
17
|
+
class TemplateMatcher:
    """Pure NumPy template matching implementation.

    Slides a 2D template over a 2D image and scores every window position
    with one of three similarity measures (see ``__init__``).
    """

    def __init__(self, method: str = "zncc"):
        """
        Args:
            method: Matching method
                - "zncc": Zero-mean Normalized Cross-Correlation (default, recommended)
                - "ncc": Normalized Cross-Correlation
                - "ssd": Sum of Squared Differences
                Any other value is treated as "zncc" by ``match_template``.
        """
        self.method = method

    def match_template(
        self,
        image: np.ndarray,
        template: np.ndarray,
        step: int = 1,
        mask_threshold: Optional[float] = None,
    ) -> np.ndarray:
        """
        Compute similarity map between image and template.

        Args:
            image: Target image (grayscale, normalized 0-1)
            template: Template to search for (grayscale, normalized 0-1)
            step: Step size for sliding window (1 = pixel perfect, >1 = faster)
            mask_threshold: If provided, pixels >= this value in template are masked (ignored).
                Useful for ignoring white backgrounds (e.g., 0.95 for near-white)

        Returns:
            2D array of match scores with one entry per window position,
            i.e. shape ``((img_h - h) // step + 1, (img_w - w) // step + 1)``.
            The array is empty when the template does not fit inside the image.

        Raises:
            ValueError: If ``step`` is smaller than 1.
        """
        if self.method == "ncc":
            return self._ncc(image, template, step, mask_threshold)
        if self.method == "ssd":
            return self._ssd(image, template, step, mask_threshold)
        # "zncc" and any unrecognised method name both use ZNCC.
        return self._zncc(image, template, step, mask_threshold)

    @staticmethod
    def _as_float(array: np.ndarray) -> np.ndarray:
        """Return ``array`` unchanged if it is floating point, else as float64.

        Integer/bool inputs would otherwise wrap around or be truncated when
        values are centred, squared or subtracted.
        """
        array = np.asarray(array)
        if np.issubdtype(array.dtype, np.floating):
            return array
        return array.astype(np.float64)

    @classmethod
    def _prepare(
        cls,
        image: np.ndarray,
        template: np.ndarray,
        step: int,
        mask_threshold: Optional[float],
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Validate inputs and build the (image, template, mask, zeroed result) tuple."""
        if step < 1:
            raise ValueError(f"step must be a positive integer, got {step!r}")

        image = cls._as_float(image)
        template = cls._as_float(template)
        h, w = template.shape
        img_h, img_w = image.shape

        # A template larger than the image has no valid window positions;
        # clamp to 0 so np.zeros never sees a negative dimension.
        out_h = max(0, (img_h - h) // step + 1)
        out_w = max(0, (img_w - w) // step + 1)
        result = np.zeros((out_h, out_w))

        if mask_threshold is not None:
            mask = template < mask_threshold  # True for pixels to keep
        else:
            mask = np.ones(template.shape, dtype=bool)
        return image, template, mask, result

    @staticmethod
    def _iter_positions(shape: Tuple[int, int], step: int):
        """Yield ``(i, j, y, x)``: result indices and the matching image offsets."""
        rows, cols = shape
        for i in range(rows):
            for j in range(cols):
                yield i, j, i * step, j * step

    def _zncc(
        self,
        image: np.ndarray,
        template: np.ndarray,
        step: int = 1,
        mask_threshold: Optional[float] = None,
    ) -> np.ndarray:
        """Zero-mean Normalized Cross-Correlation - most robust.

        Scores are clipped to [-1, 1]. Positions whose window has no
        variation keep a score of 0.
        """
        image, template, mask, result = self._prepare(image, template, step, mask_threshold)

        # Template statistics on non-masked pixels (loop invariant).
        template_values = template[mask]
        if template_values.size == 0:
            # All pixels are masked (or the template is empty) - return zeros
            return result

        h, w = template.shape
        template_mean = np.mean(template_values)
        template_centered = template_values - template_mean
        template_std = np.sqrt(np.sum(template_centered**2))
        uniform_template = template_std == 0

        for i, j, y, x in self._iter_positions(result.shape, step):
            window_values = image[y : y + h, x : x + w][mask]
            window_mean = np.mean(window_values)

            if uniform_template:
                # Template has no variation, so correlation is undefined:
                # count a match when the window is also flat with the same mean.
                if abs(window_mean - template_mean) < 0.01 and np.std(window_values) < 0.01:
                    result[i, j] = 1.0
                continue

            window_centered = window_values - window_mean
            window_std = np.sqrt(np.sum(window_centered**2))
            if window_std > 0:
                correlation = np.sum(window_centered * template_centered)
                result[i, j] = correlation / (template_std * window_std)

        return np.clip(result, -1, 1)

    def _ncc(
        self,
        image: np.ndarray,
        template: np.ndarray,
        step: int = 1,
        mask_threshold: Optional[float] = None,
    ) -> np.ndarray:
        """Normalized Cross-Correlation.

        Positions whose window has zero norm keep a score of 0.
        """
        image, template, mask, result = self._prepare(image, template, step, mask_threshold)

        template_values = template[mask]
        template_norm = np.sqrt(np.sum(template_values**2))
        if template_norm == 0:
            # Covers both an all-zero template and a fully masked one.
            return result

        h, w = template.shape
        for i, j, y, x in self._iter_positions(result.shape, step):
            window_values = image[y : y + h, x : x + w][mask]
            window_norm = np.sqrt(np.sum(window_values**2))
            if window_norm > 0:
                correlation = np.sum(window_values * template_values)
                result[i, j] = correlation / (template_norm * window_norm)

        return result

    def _ssd(
        self,
        image: np.ndarray,
        template: np.ndarray,
        step: int = 1,
        mask_threshold: Optional[float] = None,
    ) -> np.ndarray:
        """Sum of Squared Differences - converted to similarity score.

        The score is ``1 / (1 + mean squared difference)`` over the
        non-masked pixels, so an exact match scores 1.0.
        """
        image, template, mask, result = self._prepare(image, template, step, mask_threshold)

        # Number of valid pixels for normalization
        n_valid = np.sum(mask)
        if n_valid == 0:
            return result

        h, w = template.shape
        template_values = template[mask]
        for i, j, y, x in self._iter_positions(result.shape, step):
            # Only compute SSD on non-masked pixels
            diff = image[y : y + h, x : x + w][mask] - template_values
            ssd = np.sum(diff**2) / n_valid
            result[i, j] = 1.0 / (1.0 + ssd)  # Convert to similarity

        return result
|
@@ -26,7 +26,7 @@ natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZD
|
|
26
26
|
natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
28
|
natural_pdf/core/element_manager.py,sha256=KPuKM7SstfErTkRnGq4vrgE0Tv8iazN13Jp7yAXGKso,55575
|
29
|
-
natural_pdf/core/highlighting_service.py,sha256=
|
29
|
+
natural_pdf/core/highlighting_service.py,sha256=wEV-koqHoHf7S3wZ3j8D2L-ucGp3Nd0YhhStz9yqeLc,70406
|
30
30
|
natural_pdf/core/page.py,sha256=Pid5hqVjcyX-gcCzxCJ62k6AQhNbUMNM_5QmEcylIjM,155264
|
31
31
|
natural_pdf/core/page_collection.py,sha256=IjdFq9q0D0P6ZKWInf0H25rLzxfMb7RsUXucogkhNkU,63169
|
32
32
|
natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
|
@@ -39,13 +39,13 @@ natural_pdf/describe/elements.py,sha256=3Y541z5TQ2obrfZFiFi1YQMsCt3oYrhMHpD5j1tu
|
|
39
39
|
natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
|
40
40
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
41
41
|
natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
|
42
|
-
natural_pdf/elements/base.py,sha256=
|
43
|
-
natural_pdf/elements/element_collection.py,sha256=
|
42
|
+
natural_pdf/elements/base.py,sha256=iJHEejlYu8RNvlKYK2UHAnAlz6tXkiEaGnG2xYtVnuU,59635
|
43
|
+
natural_pdf/elements/element_collection.py,sha256=CVfnprzKTLeGSpvhGL2ZQVzZ8veSoFtCBlSSGDmX5lY,136594
|
44
44
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
45
45
|
natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
|
46
|
-
natural_pdf/elements/rect.py,sha256=
|
47
|
-
natural_pdf/elements/region.py,sha256=
|
48
|
-
natural_pdf/elements/text.py,sha256=
|
46
|
+
natural_pdf/elements/rect.py,sha256=kmUmhwnihd-aTweAO-LsngRDo5Iqmx7lcSa8ZBlE_2E,4544
|
47
|
+
natural_pdf/elements/region.py,sha256=DM8o0ptm86B2ouOqDgUK_av9cCN6G5iIlJ1VC3KfVWk,167379
|
48
|
+
natural_pdf/elements/text.py,sha256=dOiss-cSBYnK9j7KqmqmvJcCidBcIhckLJCW8lVz2es,21210
|
49
49
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
50
50
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
51
51
|
natural_pdf/exporters/base.py,sha256=379sioW_hbkGb21sEVuJhbkkDO5MFsFtTUNO5TgG2YU,2101
|
@@ -85,7 +85,7 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
|
|
85
85
|
natural_pdf/search/search_service_protocol.py,sha256=u8pbuWP96fnQEe6mnreY9DrdiDAHP6ZCY7phvSbFlP8,6697
|
86
86
|
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
87
87
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
88
|
-
natural_pdf/selectors/parser.py,sha256=
|
88
|
+
natural_pdf/selectors/parser.py,sha256=WOoJlCxyz6F50uWJEBWSucANWMvihroezMBZMPCZxzE,40388
|
89
89
|
natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
|
90
90
|
natural_pdf/tables/result.py,sha256=-8ctA-jCJYSHtlfAoqTvhUwO5zSP2BQxxetAjqEsNyg,8665
|
91
91
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
@@ -101,38 +101,39 @@ natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-
|
|
101
101
|
natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
|
102
102
|
natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
|
103
103
|
natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
|
104
|
-
natural_pdf/vision/__init__.py,sha256=
|
105
|
-
natural_pdf/vision/mixin.py,sha256=
|
106
|
-
natural_pdf/vision/results.py,sha256=
|
107
|
-
natural_pdf/vision/similarity.py,sha256=
|
104
|
+
natural_pdf/vision/__init__.py,sha256=TkoQtdODlh0n_99dsjLIWKE9dgK0m4jfrui_cQ3gTwU,221
|
105
|
+
natural_pdf/vision/mixin.py,sha256=wlsX42cFUnUepZHsEfKBqXiDEPUwBG6-KN2Cx5qz_lw,10812
|
106
|
+
natural_pdf/vision/results.py,sha256=_NBRCKtDd1M3sWK7zHSym7-jpQqW4kR_iFFL4PvnBNo,6649
|
107
|
+
natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2ZzU,17925
|
108
|
+
natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
|
108
109
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
109
110
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
110
|
-
natural_pdf-0.2.
|
111
|
+
natural_pdf-0.2.15.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
111
112
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
112
113
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
113
114
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
114
115
|
optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
|
115
116
|
optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
|
116
|
-
temp/debug_cell_extraction.py,sha256=nE0Z470P40v8xZfWO1V3qgNaejs_pernEQaUOFeOJ1U,1527
|
117
|
-
temp/debug_exclusion_overlap.py,sha256=RptJXwqBXy5gsvMF037KEx1o2QgjwEDkMB6TD5aJdqA,1644
|
118
|
-
temp/debug_exclusions_guides.py,sha256=s8siep9te1KRJ2j0vH1tvDQnBlz7PKbHeCiYMrZL8jE,2096
|
119
|
-
temp/debug_extra_guide.py,sha256=95Tim-YnmAR4kICw2XDKVDvlW5WsjK_51cv5-EV11rc,1236
|
120
|
-
temp/debug_outer_boundaries.py,sha256=uJUJwojTxOU4VtbGUouuhV65IYzS6NDIVKxnS7o64nU,1456
|
121
|
-
temp/debug_st_search.py,sha256=F4c_mUVi_d5AKaKIpQ0AnW1amDqAwALoQQj7wZj--J0,1021
|
122
117
|
temp/fix_page_exclusions.py,sha256=YIj62zF38TdoBARAuSIvEbetl_JfXG-mp4v9p355qmo,1358
|
118
|
+
temp/test_draw_guides.py,sha256=_eSSBElGHQkd2QD_KA_Okw70v0dlY5m-1-C5SQwKAJw,642
|
119
|
+
temp/test_draw_guides_interactive.py,sha256=FsH-2ZQGsGx_8QfVCWUAkLbOcJz-VfiwROzQD4AD7kQ,926
|
123
120
|
temp/test_exclusion_with_debug.py,sha256=CScxHvb43KrB5dzXuTOhuzjcBXZBdfYB5ygiKkEW26g,1393
|
124
121
|
temp/test_find_exclusions_fix.py,sha256=1l5aEqnElcl3kiykdtmJFlVxQ1xMKGm1UckGYEQg--c,2103
|
125
122
|
temp/test_find_exclusions_fix_no_recursion.py,sha256=qZspTBwxunRM93N_-fZ2fR5Lodj0ArQX3h10HlTXhfc,3592
|
126
123
|
temp/test_fix_real_pdf.py,sha256=uuylxmpeAEbIix9wjl0Gri1sZlN61dBWTq6ZCyfvzF8,1454
|
127
124
|
temp/test_fix_working.py,sha256=-Ryre1rXYA2EG_lmPZGYEGi8yz0slhHEXPJMYexZW84,1750
|
128
125
|
temp/test_fixed_pdf_exclusions.py,sha256=Q5zxooKDvtTXo-dDsx3nsQw1ZVHX3TW47iZ_dXpFdrY,2168
|
126
|
+
temp/test_guide_draw_notebook.py,sha256=9yYRV5mfmVHiL1lnwNj-vksw45d1oWbAZpDGA7yZf-M,1583
|
129
127
|
temp/test_horizontal_top_bottom.py,sha256=Mb3tjt9Z3wOTpzFOgK7i0K-j-_ynNh4vDu2x1L3nu-s,2163
|
128
|
+
temp/test_inline_js.py,sha256=xuQH8VQn7L4sogv6wd_Rwudx5p_Lt6we1h7U1LPTH-g,646
|
130
129
|
temp/test_marker_order.py,sha256=TFZkMxRiNoZGVcdDivYnkIDNvwHaiyKUdYoy2rTTIiI,1417
|
131
130
|
temp/test_original_exclusions_now_work.py,sha256=G6LmaF-P9Qhj0j4lT_4ncfCddllfP6L8F_x2prUBr9w,1904
|
132
131
|
temp/test_pdf_exclusions_with_guides.py,sha256=QaMl0frgKC8kCPQ2BUI8kqyvqsIjQPXKV_St1rK3zxg,2754
|
133
132
|
temp/test_region_exclusions_detailed.py,sha256=EftdW3JY3JH_LX5QlWKt-4drM-joPggK2fKUZRXVTMA,814
|
134
133
|
temp/test_stripes_real_pdf.py,sha256=FIvDoJrnuioOMw1A0aTCCfZLeg99lusfe0Fb0MiqnhQ,2618
|
135
134
|
temp/test_vertical_stripes.py,sha256=Yf3TJfb_faqAFzlgb7i5u6dDHjF4UMSHIGM99vangRk,1877
|
135
|
+
temp/test_widget_functionality.py,sha256=jsEGHYK1dWWa8uEcfGRRj1ReHRMzNoIaMZU4d-o-Djs,2448
|
136
|
+
temp/test_widget_simple.py,sha256=Vy_DKgPhPhUQ8nKw_KnhGTpwtmh5EEic0avEyW9hbOQ,1398
|
136
137
|
tools/bad_pdf_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
137
138
|
tools/bad_pdf_eval/analyser.py,sha256=oqSTo3NLyignp_XdCO9_SRCUUXMU8lfgDavKYZYNxws,13690
|
138
139
|
tools/bad_pdf_eval/collate_summaries.py,sha256=L_YsdiqmwGIHYWTVJqo6gyazyn3GIQgpfGGKk8uwckk,5159
|
@@ -144,8 +145,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
144
145
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
145
146
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
146
147
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
147
|
-
natural_pdf-0.2.
|
148
|
-
natural_pdf-0.2.
|
149
|
-
natural_pdf-0.2.
|
150
|
-
natural_pdf-0.2.
|
151
|
-
natural_pdf-0.2.
|
148
|
+
natural_pdf-0.2.15.dist-info/METADATA,sha256=TfUHRd_THs6bk7Iwl8UdBjZfnWEXKg_tBiVn_Q9tv28,6960
|
149
|
+
natural_pdf-0.2.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
150
|
+
natural_pdf-0.2.15.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
151
|
+
natural_pdf-0.2.15.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
|
152
|
+
natural_pdf-0.2.15.dist-info/RECORD,,
|
temp/test_draw_guides.py
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
"""Example usage of the interactive guide drawing feature"""
|
2
|
+
|
3
|
+
# In a Jupyter notebook:
|
4
|
+
from natural_pdf import NaturalPDF
|
5
|
+
|
6
|
+
# Load a PDF
|
7
|
+
pdf = NaturalPDF.from_file("your_pdf.pdf")
|
8
|
+
page = pdf[0]
|
9
|
+
|
10
|
+
# Create guides
|
11
|
+
guides = page.guides()
|
12
|
+
|
13
|
+
# Detect some initial guides (optional)
|
14
|
+
guides.vertical.from_lines(n=5)
|
15
|
+
guides.horizontal.from_lines(n=5)
|
16
|
+
|
17
|
+
# Open interactive editor for vertical guides
|
18
|
+
guides.vertical.draw()
|
19
|
+
|
20
|
+
# Open interactive editor for horizontal guides
|
21
|
+
guides.horizontal.draw(width=600) # Smaller widget
|
22
|
+
|
23
|
+
# After editing, the guides are automatically updated
|
24
|
+
# You can now use them to extract tables:
|
25
|
+
table = page.extract_table(guides)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
"""Test the interactive guide drawing functionality"""
|
2
|
+
|
3
|
+
from natural_pdf.core.pdf import PDF
|
4
|
+
from natural_pdf.analyzers.guides import Guides
|
5
|
+
|
6
|
+
# Load a sample PDF
|
7
|
+
pdf = PDF("tests/sample_pdfs/simple_table.pdf")
|
8
|
+
page = pdf.pages[0]
|
9
|
+
|
10
|
+
# Create guides
|
11
|
+
guides = Guides(page)
|
12
|
+
|
13
|
+
# Add some initial guides for testing
|
14
|
+
guides.vertical.from_content()
|
15
|
+
guides.horizontal.from_lines(n=5)
|
16
|
+
|
17
|
+
print("Initial vertical guides:", list(guides.vertical))
|
18
|
+
print("Initial horizontal guides:", list(guides.horizontal))
|
19
|
+
|
20
|
+
# This would open the interactive widget in Jupyter
|
21
|
+
# guides.vertical.draw()
|
22
|
+
|
23
|
+
# For non-Jupyter testing, we can check the method exists
|
24
|
+
assert hasattr(guides.vertical, 'draw')
|
25
|
+
assert callable(guides.vertical.draw)
|
26
|
+
|
27
|
+
print("\nSuccess! The draw() method is available on GuidesList objects.")
|
28
|
+
print("To use it interactively, run this in a Jupyter notebook:")
|
29
|
+
print(" guides.vertical.draw()")
|
30
|
+
print(" guides.horizontal.draw(width=600)")
|
@@ -0,0 +1,47 @@
|
|
1
|
+
"""Test script to verify the draw() method works"""
|
2
|
+
|
3
|
+
import sys
|
4
|
+
sys.path.insert(0, '.')
|
5
|
+
|
6
|
+
from natural_pdf.analyzers.guides import GuidesList, Guides
|
7
|
+
|
8
|
+
# Create a mock context for testing
|
9
|
+
class MockContext:
|
10
|
+
def __init__(self):
|
11
|
+
self.width = 600
|
12
|
+
self.height = 800
|
13
|
+
|
14
|
+
def render(self, resolution=150):
|
15
|
+
# Create a simple test image
|
16
|
+
from PIL import Image
|
17
|
+
img = Image.new('RGB', (int(self.width * resolution/72), int(self.height * resolution/72)), 'white')
|
18
|
+
return img
|
19
|
+
|
20
|
+
# Test that the draw method exists
|
21
|
+
mock_context = MockContext()
|
22
|
+
guides = Guides(mock_context)
|
23
|
+
|
24
|
+
# Add some test guides
|
25
|
+
guides.vertical.data = [100, 200, 300, 400, 500]
|
26
|
+
guides.horizontal.data = [150, 350, 550, 750]
|
27
|
+
|
28
|
+
print("Initial vertical guides:", list(guides.vertical))
|
29
|
+
print("Initial horizontal guides:", list(guides.horizontal))
|
30
|
+
|
31
|
+
# Check that draw method exists
|
32
|
+
assert hasattr(guides.vertical, 'draw')
|
33
|
+
assert callable(guides.vertical.draw)
|
34
|
+
assert hasattr(guides.horizontal, 'draw')
|
35
|
+
assert callable(guides.horizontal.draw)
|
36
|
+
|
37
|
+
print("\nSuccess! The draw() method is available.")
|
38
|
+
print("\nIn a Jupyter notebook, you would use:")
|
39
|
+
print(" guides.vertical.draw() # Interactive vertical guide editor")
|
40
|
+
print(" guides.horizontal.draw() # Interactive horizontal guide editor")
|
41
|
+
print("\nFeatures:")
|
42
|
+
print(" - Click to add new guides")
|
43
|
+
print(" - Click existing guides to select them")
|
44
|
+
print(" - Drag to move guides")
|
45
|
+
print(" - Delete key to remove selected guide")
|
46
|
+
print(" - Arrow keys to fine-tune position")
|
47
|
+
print(" - Enter to apply, Escape to cancel")
|
temp/test_inline_js.py
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
"""Test inline JavaScript in HTML widget"""
|
2
|
+
|
3
|
+
import ipywidgets as widgets
|
4
|
+
from IPython.display import display
|
5
|
+
|
6
|
+
# Create an HTML widget with inline JavaScript
|
7
|
+
html_content = '''
|
8
|
+
<div id="test-div">Click me!</div>
|
9
|
+
<script type="text/javascript">
|
10
|
+
document.getElementById('test-div').addEventListener('click', function() {
|
11
|
+
alert('Clicked!');
|
12
|
+
this.innerHTML = 'Clicked at ' + new Date().toLocaleTimeString();
|
13
|
+
});
|
14
|
+
console.log('JavaScript is running!');
|
15
|
+
</script>
|
16
|
+
'''
|
17
|
+
|
18
|
+
# Display using widgets.HTML
|
19
|
+
html_widget = widgets.HTML(value=html_content)
|
20
|
+
display(html_widget)
|
21
|
+
|
22
|
+
print("If you see 'Click me!' above and can click it, JavaScript is working.")
|
@@ -0,0 +1,68 @@
|
|
1
|
+
#!/usr/bin/env python
"""Test the guide widget functionality"""

import sys
import os

# Make the directory above this script importable.
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, parent_dir)

# Import the widget and exercise it against lightweight mocks.
try:
    from natural_pdf.analyzers.guides import InteractiveGuideWidget, GuidesList, _GUIDE_WIDGET_AVAILABLE
    print("✓ Successfully imported InteractiveGuideWidget")

    if not _GUIDE_WIDGET_AVAILABLE:
        print("⚠ ipywidgets not available - widget functionality disabled")
    else:
        print("✓ ipywidgets is available")

        class MockPage:
            """Page stand-in with an A4 bounding box and a blank render."""

            def __init__(self):
                self.bbox = (0, 0, 595, 842)  # A4 page size in points

            def render(self, resolution=150):
                from PIL import Image
                pixel_size = (int(595 * resolution / 72), int(842 * resolution / 72))
                return Image.new('RGB', pixel_size, color='white')

        class MockGuides:
            """Parent object whose ``context`` is the mock page."""

            def __init__(self):
                self.context = MockPage()

        class MockGuidesList:
            """Guide list stand-in holding three vertical positions."""

            def __init__(self):
                self.data = [100, 200, 300]
                self._axis = 'vertical'
                self._parent = MockGuides()

        # Try to build the widget from the mock list.
        mock_guides = MockGuidesList()
        try:
            widget = InteractiveGuideWidget(mock_guides)
            print("✓ Successfully created InteractiveGuideWidget instance")
            print(f" - Widget ID: {widget.widget_id}")
            print(f" - Widget base classes: {InteractiveGuideWidget.__bases__}")

            # Report which of the expected methods the widget exposes.
            expected_methods = ['_generate_content', 'update_guides']
            for method in expected_methods:
                status = "Has" if hasattr(widget, method) else "Missing"
                print(f" - {status} method: {method}")

        except Exception as e:
            print(f"✗ Error creating widget: {e}")

except ImportError as e:
    print(f"✗ Import error: {e}")

except Exception as e:
    print(f"✗ Unexpected error: {e}")

print("\nWidget implementation test complete!")
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env python
"""Simple test for the guide widget"""

import sys
import os

# Make the directory above this script importable.
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, parent_dir)

# Import the widget module; a failed import ends the script with status 1.
try:
    from natural_pdf.analyzers.guides import InteractiveGuideWidget, _GUIDE_WIDGET_AVAILABLE
except ImportError as e:
    print(f"✗ Import error: {e}")
    sys.exit(1)
else:
    print("✓ Module imported successfully")
    print(f"✓ Widget available: {_GUIDE_WIDGET_AVAILABLE}")
    print(
        "✓ ipywidgets is installed and InteractiveGuideWidget is available"
        if _GUIDE_WIDGET_AVAILABLE
        else "✗ ipywidgets is not installed"
    )

# Inspect the widget class itself when the widget is available.
if not _GUIDE_WIDGET_AVAILABLE:
    print("⚠ Skipping widget checks as ipywidgets is not available")
else:
    try:
        # No GuidesList is at hand to instantiate with, so only the class is examined.
        print(f"✓ InteractiveGuideWidget class: {InteractiveGuideWidget}")
        print(f"✓ Widget base classes: {InteractiveGuideWidget.__bases__}")

        # List the attribute names that do not start with an underscore.
        methods = [name for name in dir(InteractiveGuideWidget) if not name.startswith('_')]
        print(f"✓ Public methods: {methods}")

    except Exception as e:
        print(f"✗ Error checking widget class: {e}")

print("\nAll checks passed!")
|
temp/debug_cell_extraction.py
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
"""Debug cell text extraction with exclusions"""
|
2
|
-
from natural_pdf import PDF
|
3
|
-
from natural_pdf.analyzers.guides import Guides
|
4
|
-
|
5
|
-
pdf = PDF("pdfs/m27.pdf")
|
6
|
-
page = pdf.pages[0]
|
7
|
-
|
8
|
-
# Add exclusions
|
9
|
-
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
|
10
|
-
|
11
|
-
# Check exclusions are registered
|
12
|
-
print("Exclusions on page:")
|
13
|
-
exclusions = page._get_exclusion_regions(debug=True)
|
14
|
-
|
15
|
-
# Create guides and build grid
|
16
|
-
headers = page.find(text="NUMBER").right(include_source=True).expand(top=3, bottom=3).find_all('text')
|
17
|
-
guides = Guides(page)
|
18
|
-
guides.vertical.from_content(headers, align='left')
|
19
|
-
guides.horizontal.from_stripes()
|
20
|
-
|
21
|
-
# Build grid and get cells
|
22
|
-
grid_result = guides.build_grid(include_outer_boundaries=True)
|
23
|
-
cells = grid_result["regions"]["cells"]
|
24
|
-
|
25
|
-
print(f"\nTotal cells: {len(cells)}")
|
26
|
-
|
27
|
-
# Check first row cells (these should be in excluded area)
|
28
|
-
first_row_cells = [c for c in cells if c.bbox[1] < 90] # y < 90
|
29
|
-
print(f"\nFirst row cells: {len(first_row_cells)}")
|
30
|
-
|
31
|
-
for i, cell in enumerate(first_row_cells[:3]):
|
32
|
-
print(f"\nCell {i}:")
|
33
|
-
print(f" Bbox: {cell.bbox}")
|
34
|
-
print(f" Raw text: {repr(cell.extract_text(apply_exclusions=False))}")
|
35
|
-
print(f" With exclusions: {repr(cell.extract_text(apply_exclusions=True))}")
|
36
|
-
|
37
|
-
# Now test the full table extraction
|
38
|
-
print("\n\nFull table extraction:")
|
39
|
-
result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
|
40
|
-
df = result.to_df()
|
41
|
-
print("\nFirst row of dataframe:")
|
42
|
-
print(df.iloc[0].to_dict() if not df.empty else "Empty")
|
temp/debug_exclusion_overlap.py
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
"""Debug how exclusions work with overlapping regions"""
|
2
|
-
from natural_pdf import PDF
|
3
|
-
from natural_pdf.analyzers.guides import Guides
|
4
|
-
|
5
|
-
pdf = PDF("pdfs/m27.pdf")
|
6
|
-
page = pdf.pages[0]
|
7
|
-
|
8
|
-
# Add exclusion
|
9
|
-
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
|
10
|
-
|
11
|
-
# Get the exclusion region
|
12
|
-
exclusions = page._get_exclusion_regions()
|
13
|
-
excl_region = exclusions[0]
|
14
|
-
print(f"Exclusion region: {excl_region.bbox}")
|
15
|
-
print(f"Exclusion bottom: {excl_region.bbox[3]}")
|
16
|
-
|
17
|
-
# Create a test cell that overlaps the exclusion
|
18
|
-
# Cell 1 from before: (32.06, 0.5, 73.18288, 79.53999999999996)
|
19
|
-
test_cell = page.region(32.06, 0.5, 73.18288, 79.53999999999996)
|
20
|
-
|
21
|
-
print(f"\nTest cell: {test_cell.bbox}")
|
22
|
-
print(f"Cell overlaps exclusion: top={test_cell.bbox[1]} < excl_bottom={excl_region.bbox[3]}")
|
23
|
-
|
24
|
-
# Extract text from different y-ranges
|
25
|
-
print("\nText in different parts of the cell:")
|
26
|
-
|
27
|
-
# Part above exclusion line (should be empty)
|
28
|
-
upper_part = page.region(32.06, 0.5, 73.18288, 59.12)
|
29
|
-
print(f"Upper part (0.5 to 59.12): '{upper_part.extract_text(apply_exclusions=True)}'")
|
30
|
-
|
31
|
-
# Part below exclusion line (should have text)
|
32
|
-
lower_part = page.region(32.06, 59.12, 73.18288, 79.54)
|
33
|
-
print(f"Lower part (59.12 to 79.54): '{lower_part.extract_text()}'")
|
34
|
-
|
35
|
-
# The whole cell
|
36
|
-
print(f"Whole cell with exclusions: '{test_cell.extract_text(apply_exclusions=True)}'")
|
37
|
-
print(f"Whole cell without exclusions: '{test_cell.extract_text(apply_exclusions=False)}'")
|
38
|
-
|
39
|
-
# Check what text elements are in this region
|
40
|
-
print("\nText elements in cell:")
|
41
|
-
cell_texts = test_cell.find_all('text')
|
42
|
-
for t in cell_texts[:5]:
|
43
|
-
print(f" '{t.text}' at y={t.top:.2f}-{t.bottom:.2f}")
|