natural-pdf 0.2.12__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,209 @@
1
+ """Pure NumPy template matching implementation"""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import List, Optional, Tuple
5
+
6
+ import numpy as np
7
+
8
+
9
@dataclass
class TemplateMatch:
    """A single template-matching hit.

    Attributes:
        bbox: Bounding box of the hit as ``(x0, y0, x1, y1)``.
        score: Match quality between 0 and 1; higher is better.
    """

    bbox: Tuple[int, int, int, int]
    score: float
15
+
16
+
17
class TemplateMatcher:
    """Pure NumPy template matching implementation.

    A template is slid across an image and every visited position receives a
    similarity score. Three scoring methods are available:

    - ``"zncc"``: Zero-mean Normalized Cross-Correlation (default, recommended)
    - ``"ncc"``: Normalized Cross-Correlation
    - ``"ssd"``: Sum of Squared Differences, mapped to a similarity score

    All arithmetic is carried out in ``float64`` so that integer images
    (e.g. ``uint8``) cannot overflow or be truncated while being centred,
    squared or subtracted.
    """

    # Norms at or below this value are treated as "no variation". Centring a
    # perfectly uniform patch can leave rounding noise of ~1e-16 per pixel;
    # normalising by such a norm would turn that noise into arbitrary scores.
    _FLAT_EPS = 1e-10

    def __init__(self, method: str = "zncc"):
        """
        Args:
            method: Matching method
                - "zncc": Zero-mean Normalized Cross-Correlation (default, recommended)
                - "ncc": Normalized Cross-Correlation
                - "ssd": Sum of Squared Differences
        """
        self.method = method

    def match_template(
        self,
        image: np.ndarray,
        template: np.ndarray,
        step: int = 1,
        mask_threshold: Optional[float] = None,
    ) -> np.ndarray:
        """
        Compute similarity map between image and template.

        Args:
            image: Target image (2D grayscale, normalized 0-1)
            template: Template to search for (2D grayscale, normalized 0-1)
            step: Step size for sliding window (1 = pixel perfect, >1 = faster)
            mask_threshold: If provided, pixels >= this value in template are masked (ignored).
                Useful for ignoring white backgrounds (e.g., 0.95 for near-white)

        Returns:
            2D float array of match scores with shape
            ``((img_h - h) // step + 1, (img_w - w) // step + 1)``. The array
            is empty when the template does not fit inside the image.

        Raises:
            ValueError: If ``step`` is smaller than 1.
        """
        scorers = {"zncc": self._zncc, "ncc": self._ncc, "ssd": self._ssd}
        # Unknown method names fall back to zncc, as they always have.
        scorer = scorers.get(self.method, self._zncc)
        return scorer(image, template, step, mask_threshold)

    @staticmethod
    def _prepare(
        image: np.ndarray,
        template: np.ndarray,
        step: int,
        mask_threshold: Optional[float],
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Validate inputs and build the state shared by every scoring method.

        Returns:
            ``(image, template, mask, result)`` where image and template are
            float64 copies/views, ``mask`` is True for template pixels that
            take part in scoring, and ``result`` is the zero-filled score map.
        """
        if step < 1:
            raise ValueError(f"step must be >= 1, got {step!r}")

        # Work in float64: integer dtypes would wrap on subtraction/squaring.
        image = np.asarray(image, dtype=np.float64)
        template = np.asarray(template, dtype=np.float64)

        h, w = template.shape
        img_h, img_w = image.shape

        # Clamp at zero so an oversized template yields an empty score map
        # instead of a "negative dimensions" error from np.zeros.
        out_h = max((img_h - h) // step + 1, 0)
        out_w = max((img_w - w) // step + 1, 0)
        result = np.zeros((out_h, out_w))

        if mask_threshold is not None:
            mask = template < mask_threshold  # True for pixels to keep
        else:
            mask = np.ones(template.shape, dtype=bool)

        return image, template, mask, result

    @staticmethod
    def _windows(image: np.ndarray, h: int, w: int, step: int, out_shape: Tuple[int, int]):
        """Yield ``(i, j, window)`` for every output cell, in row-major order."""
        out_h, out_w = out_shape
        for i in range(out_h):
            y = i * step
            for j in range(out_w):
                x = j * step
                yield i, j, image[y : y + h, x : x + w]

    def _zncc(
        self,
        image: np.ndarray,
        template: np.ndarray,
        step: int = 1,
        mask_threshold: Optional[float] = None,
    ) -> np.ndarray:
        """Zero-mean Normalized Cross-Correlation - most robust.

        Scores lie in [-1, 1]. A template without variation cannot be
        correlated, so it instead scores 1.0 wherever the window is also flat
        with (nearly) the same mean, and 0.0 elsewhere.
        """
        image, template, mask, result = self._prepare(image, template, step, mask_threshold)
        if result.size == 0 or not mask.any():
            # Nothing to scan, or every template pixel is masked out.
            return result

        h, w = template.shape

        # Template statistics over the non-masked pixels, computed once.
        tmpl_vals = template[mask]
        tmpl_mean = tmpl_vals.mean()
        tmpl_centered = tmpl_vals - tmpl_mean
        tmpl_norm = np.sqrt(np.sum(tmpl_centered**2))
        template_is_flat = tmpl_norm <= self._FLAT_EPS

        for i, j, window in self._windows(image, h, w, step, result.shape):
            win_vals = window[mask]
            win_mean = win_vals.mean()

            if template_is_flat:
                # Perfect match if window also has same mean and no variation
                if abs(win_mean - tmpl_mean) < 0.01 and win_vals.std() < 0.01:
                    result[i, j] = 1.0
                continue

            win_centered = win_vals - win_mean
            win_norm = np.sqrt(np.sum(win_centered**2))
            # Flat windows keep their score of 0: correlation is undefined.
            if win_norm > self._FLAT_EPS:
                correlation = np.sum(win_centered * tmpl_centered)
                result[i, j] = correlation / (tmpl_norm * win_norm)

        return np.clip(result, -1, 1)

    def _ncc(
        self,
        image: np.ndarray,
        template: np.ndarray,
        step: int = 1,
        mask_threshold: Optional[float] = None,
    ) -> np.ndarray:
        """Normalized Cross-Correlation.

        Positions whose window (or the template itself) has zero norm over the
        non-masked pixels keep a score of 0.
        """
        image, template, mask, result = self._prepare(image, template, step, mask_threshold)
        if result.size == 0 or not mask.any():
            return result

        h, w = template.shape
        tmpl_vals = template[mask]
        tmpl_norm = np.sqrt(np.sum(tmpl_vals**2))
        if tmpl_norm == 0:
            return result

        for i, j, window in self._windows(image, h, w, step, result.shape):
            win_vals = window[mask]
            win_norm = np.sqrt(np.sum(win_vals**2))
            if win_norm > 0:
                correlation = np.sum(win_vals * tmpl_vals)
                result[i, j] = correlation / (tmpl_norm * win_norm)

        return result

    def _ssd(
        self,
        image: np.ndarray,
        template: np.ndarray,
        step: int = 1,
        mask_threshold: Optional[float] = None,
    ) -> np.ndarray:
        """Sum of Squared Differences - converted to similarity score.

        The mean squared difference over the non-masked pixels is mapped to
        ``1 / (1 + msd)``, so an exact match scores 1.0 and the score falls
        towards 0 as the difference grows.
        """
        image, template, mask, result = self._prepare(image, template, step, mask_threshold)
        if result.size == 0 or not mask.any():
            return result

        h, w = template.shape
        tmpl_vals = template[mask]
        # Number of valid pixels for normalization
        n_valid = tmpl_vals.size

        for i, j, window in self._windows(image, h, w, step, result.shape):
            # Only compute SSD on non-masked pixels
            diff = window[mask] - tmpl_vals
            ssd = np.sum(diff**2) / n_valid
            result[i, j] = 1.0 / (1.0 + ssd)  # Convert to similarity

        return result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.12
3
+ Version: 0.2.15
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -26,7 +26,7 @@ natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZD
26
26
  natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
27
27
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
28
  natural_pdf/core/element_manager.py,sha256=KPuKM7SstfErTkRnGq4vrgE0Tv8iazN13Jp7yAXGKso,55575
29
- natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
29
+ natural_pdf/core/highlighting_service.py,sha256=wEV-koqHoHf7S3wZ3j8D2L-ucGp3Nd0YhhStz9yqeLc,70406
30
30
  natural_pdf/core/page.py,sha256=Pid5hqVjcyX-gcCzxCJ62k6AQhNbUMNM_5QmEcylIjM,155264
31
31
  natural_pdf/core/page_collection.py,sha256=IjdFq9q0D0P6ZKWInf0H25rLzxfMb7RsUXucogkhNkU,63169
32
32
  natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
@@ -39,13 +39,13 @@ natural_pdf/describe/elements.py,sha256=3Y541z5TQ2obrfZFiFi1YQMsCt3oYrhMHpD5j1tu
39
39
  natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
40
40
  natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
41
41
  natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
42
- natural_pdf/elements/base.py,sha256=92ukTtRCQFsa5KvKflChCt4mt0ZGS4ecGYCQTNMO4zU,58907
43
- natural_pdf/elements/element_collection.py,sha256=idM_BUWEfbCJ5Sq0Ae_KfbVHy8TdkNfzs7iWkFe_j2I,130707
42
+ natural_pdf/elements/base.py,sha256=iJHEejlYu8RNvlKYK2UHAnAlz6tXkiEaGnG2xYtVnuU,59635
43
+ natural_pdf/elements/element_collection.py,sha256=CVfnprzKTLeGSpvhGL2ZQVzZ8veSoFtCBlSSGDmX5lY,136594
44
44
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
45
45
  natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
46
- natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
47
- natural_pdf/elements/region.py,sha256=HF6KzeuudO9upVLIrPsp3omcziLcILE3nnzl1a-LvK0,165400
48
- natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
46
+ natural_pdf/elements/rect.py,sha256=kmUmhwnihd-aTweAO-LsngRDo5Iqmx7lcSa8ZBlE_2E,4544
47
+ natural_pdf/elements/region.py,sha256=DM8o0ptm86B2ouOqDgUK_av9cCN6G5iIlJ1VC3KfVWk,167379
48
+ natural_pdf/elements/text.py,sha256=dOiss-cSBYnK9j7KqmqmvJcCidBcIhckLJCW8lVz2es,21210
49
49
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
50
50
  natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
51
51
  natural_pdf/exporters/base.py,sha256=379sioW_hbkGb21sEVuJhbkkDO5MFsFtTUNO5TgG2YU,2101
@@ -85,7 +85,7 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
85
85
  natural_pdf/search/search_service_protocol.py,sha256=u8pbuWP96fnQEe6mnreY9DrdiDAHP6ZCY7phvSbFlP8,6697
86
86
  natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
87
87
  natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
88
- natural_pdf/selectors/parser.py,sha256=yV5Eb0VyNZocoYIXi7SMKsf8o66vrGNb-MeT27aEj-M,38977
88
+ natural_pdf/selectors/parser.py,sha256=WOoJlCxyz6F50uWJEBWSucANWMvihroezMBZMPCZxzE,40388
89
89
  natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
90
90
  natural_pdf/tables/result.py,sha256=-8ctA-jCJYSHtlfAoqTvhUwO5zSP2BQxxetAjqEsNyg,8665
91
91
  natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
@@ -101,38 +101,39 @@ natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-
101
101
  natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
102
102
  natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
103
103
  natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
104
- natural_pdf/vision/__init__.py,sha256=RymMY-3WLQBlOZ4Dx4MmL9UH6I65hNjkwUJ7ymO5JfM,287
105
- natural_pdf/vision/mixin.py,sha256=OJwBABr74TWxP5seTKUmGj5zE9mWsBP_UKWU-Pr8V9A,8720
106
- natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs,5119
107
- natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
104
+ natural_pdf/vision/__init__.py,sha256=TkoQtdODlh0n_99dsjLIWKE9dgK0m4jfrui_cQ3gTwU,221
105
+ natural_pdf/vision/mixin.py,sha256=wlsX42cFUnUepZHsEfKBqXiDEPUwBG6-KN2Cx5qz_lw,10812
106
+ natural_pdf/vision/results.py,sha256=_NBRCKtDd1M3sWK7zHSym7-jpQqW4kR_iFFL4PvnBNo,6649
107
+ natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2ZzU,17925
108
+ natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
108
109
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
109
110
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
110
- natural_pdf-0.2.12.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
111
+ natural_pdf-0.2.15.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
111
112
  optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
112
113
  optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
113
114
  optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
114
115
  optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
115
116
  optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
116
- temp/debug_cell_extraction.py,sha256=nE0Z470P40v8xZfWO1V3qgNaejs_pernEQaUOFeOJ1U,1527
117
- temp/debug_exclusion_overlap.py,sha256=RptJXwqBXy5gsvMF037KEx1o2QgjwEDkMB6TD5aJdqA,1644
118
- temp/debug_exclusions_guides.py,sha256=s8siep9te1KRJ2j0vH1tvDQnBlz7PKbHeCiYMrZL8jE,2096
119
- temp/debug_extra_guide.py,sha256=95Tim-YnmAR4kICw2XDKVDvlW5WsjK_51cv5-EV11rc,1236
120
- temp/debug_outer_boundaries.py,sha256=uJUJwojTxOU4VtbGUouuhV65IYzS6NDIVKxnS7o64nU,1456
121
- temp/debug_st_search.py,sha256=F4c_mUVi_d5AKaKIpQ0AnW1amDqAwALoQQj7wZj--J0,1021
122
117
  temp/fix_page_exclusions.py,sha256=YIj62zF38TdoBARAuSIvEbetl_JfXG-mp4v9p355qmo,1358
118
+ temp/test_draw_guides.py,sha256=_eSSBElGHQkd2QD_KA_Okw70v0dlY5m-1-C5SQwKAJw,642
119
+ temp/test_draw_guides_interactive.py,sha256=FsH-2ZQGsGx_8QfVCWUAkLbOcJz-VfiwROzQD4AD7kQ,926
123
120
  temp/test_exclusion_with_debug.py,sha256=CScxHvb43KrB5dzXuTOhuzjcBXZBdfYB5ygiKkEW26g,1393
124
121
  temp/test_find_exclusions_fix.py,sha256=1l5aEqnElcl3kiykdtmJFlVxQ1xMKGm1UckGYEQg--c,2103
125
122
  temp/test_find_exclusions_fix_no_recursion.py,sha256=qZspTBwxunRM93N_-fZ2fR5Lodj0ArQX3h10HlTXhfc,3592
126
123
  temp/test_fix_real_pdf.py,sha256=uuylxmpeAEbIix9wjl0Gri1sZlN61dBWTq6ZCyfvzF8,1454
127
124
  temp/test_fix_working.py,sha256=-Ryre1rXYA2EG_lmPZGYEGi8yz0slhHEXPJMYexZW84,1750
128
125
  temp/test_fixed_pdf_exclusions.py,sha256=Q5zxooKDvtTXo-dDsx3nsQw1ZVHX3TW47iZ_dXpFdrY,2168
126
+ temp/test_guide_draw_notebook.py,sha256=9yYRV5mfmVHiL1lnwNj-vksw45d1oWbAZpDGA7yZf-M,1583
129
127
  temp/test_horizontal_top_bottom.py,sha256=Mb3tjt9Z3wOTpzFOgK7i0K-j-_ynNh4vDu2x1L3nu-s,2163
128
+ temp/test_inline_js.py,sha256=xuQH8VQn7L4sogv6wd_Rwudx5p_Lt6we1h7U1LPTH-g,646
130
129
  temp/test_marker_order.py,sha256=TFZkMxRiNoZGVcdDivYnkIDNvwHaiyKUdYoy2rTTIiI,1417
131
130
  temp/test_original_exclusions_now_work.py,sha256=G6LmaF-P9Qhj0j4lT_4ncfCddllfP6L8F_x2prUBr9w,1904
132
131
  temp/test_pdf_exclusions_with_guides.py,sha256=QaMl0frgKC8kCPQ2BUI8kqyvqsIjQPXKV_St1rK3zxg,2754
133
132
  temp/test_region_exclusions_detailed.py,sha256=EftdW3JY3JH_LX5QlWKt-4drM-joPggK2fKUZRXVTMA,814
134
133
  temp/test_stripes_real_pdf.py,sha256=FIvDoJrnuioOMw1A0aTCCfZLeg99lusfe0Fb0MiqnhQ,2618
135
134
  temp/test_vertical_stripes.py,sha256=Yf3TJfb_faqAFzlgb7i5u6dDHjF4UMSHIGM99vangRk,1877
135
+ temp/test_widget_functionality.py,sha256=jsEGHYK1dWWa8uEcfGRRj1ReHRMzNoIaMZU4d-o-Djs,2448
136
+ temp/test_widget_simple.py,sha256=Vy_DKgPhPhUQ8nKw_KnhGTpwtmh5EEic0avEyW9hbOQ,1398
136
137
  tools/bad_pdf_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
138
  tools/bad_pdf_eval/analyser.py,sha256=oqSTo3NLyignp_XdCO9_SRCUUXMU8lfgDavKYZYNxws,13690
138
139
  tools/bad_pdf_eval/collate_summaries.py,sha256=L_YsdiqmwGIHYWTVJqo6gyazyn3GIQgpfGGKk8uwckk,5159
@@ -144,8 +145,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
144
145
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
145
146
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
146
147
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
147
- natural_pdf-0.2.12.dist-info/METADATA,sha256=jRNM0JxYvPDuqzD63earjbaUwQgXCjPYPLC5pLl49Uk,6960
148
- natural_pdf-0.2.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
149
- natural_pdf-0.2.12.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
150
- natural_pdf-0.2.12.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
151
- natural_pdf-0.2.12.dist-info/RECORD,,
148
+ natural_pdf-0.2.15.dist-info/METADATA,sha256=TfUHRd_THs6bk7Iwl8UdBjZfnWEXKg_tBiVn_Q9tv28,6960
149
+ natural_pdf-0.2.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
150
+ natural_pdf-0.2.15.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
151
+ natural_pdf-0.2.15.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
152
+ natural_pdf-0.2.15.dist-info/RECORD,,
@@ -0,0 +1,25 @@
1
"""Example usage of the interactive guide drawing feature.

Meant to be run cell by cell in a Jupyter notebook: ``draw()`` opens an
interactive editor, which needs a notebook front end.

NOTE(review): the entry points below (``PDF``, ``pdf.pages``, ``Guides(page)``,
``guides.extract_table()``) mirror the other scripts shipped next to this
example; confirm against the installed natural-pdf version.
"""

# In a Jupyter notebook:
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

# Load a PDF
pdf = PDF("your_pdf.pdf")
page = pdf.pages[0]

# Create guides
guides = Guides(page)

# Detect some initial guides (optional)
guides.vertical.from_lines(n=5)
guides.horizontal.from_lines(n=5)

# Open interactive editor for vertical guides
guides.vertical.draw()

# Open interactive editor for horizontal guides
guides.horizontal.draw(width=600)  # Smaller widget

# After editing, the guides are automatically updated
# You can now use them to extract tables:
table = guides.extract_table()
@@ -0,0 +1,30 @@
1
"""Smoke test: GuidesList objects expose a callable ``draw()`` method."""

from natural_pdf.core.pdf import PDF
from natural_pdf.analyzers.guides import Guides

# Open the sample document and take its first page
sample_pdf = PDF("tests/sample_pdfs/simple_table.pdf")
first_page = sample_pdf.pages[0]

# Build a guide set for that page and seed it with detected guides
guide_set = Guides(first_page)
guide_set.vertical.from_content()
guide_set.horizontal.from_lines(n=5)

print("Initial vertical guides:", list(guide_set.vertical))
print("Initial horizontal guides:", list(guide_set.horizontal))

# Opening the widget itself needs Jupyter:
# guide_set.vertical.draw()

# Outside Jupyter we can only confirm that the entry point is present
vertical_guides = guide_set.vertical
assert hasattr(vertical_guides, 'draw')
assert callable(vertical_guides.draw)

closing_lines = (
    "\nSuccess! The draw() method is available on GuidesList objects.",
    "To use it interactively, run this in a Jupyter notebook:",
    " guides.vertical.draw()",
    " guides.horizontal.draw(width=600)",
)
for line in closing_lines:
    print(line)
@@ -0,0 +1,47 @@
1
"""Check that ``draw()`` is exposed on both guide axes, using a stand-in page."""

import sys
sys.path.insert(0, '.')

from natural_pdf.analyzers.guides import GuidesList, Guides


class MockContext:
    """Minimal page-like object: fixed size in points plus a blank render."""

    def __init__(self):
        self.width = 600
        self.height = 800

    def render(self, resolution=150):
        """Return a white RGB image scaled from points (72 per inch) to pixels."""
        from PIL import Image

        pixel_size = (
            int(self.width * resolution/72),
            int(self.height * resolution/72),
        )
        return Image.new('RGB', pixel_size, 'white')


# Guides built on top of the stand-in context
guides = Guides(MockContext())

# Seed both axes with fixed positions
guides.vertical.data = [100, 200, 300, 400, 500]
guides.horizontal.data = [150, 350, 550, 750]

print("Initial vertical guides:", list(guides.vertical))
print("Initial horizontal guides:", list(guides.horizontal))

# Each axis must offer a callable draw()
for axis_guides in (guides.vertical, guides.horizontal):
    assert hasattr(axis_guides, 'draw')
    assert callable(axis_guides.draw)

summary = [
    "\nSuccess! The draw() method is available.",
    "\nIn a Jupyter notebook, you would use:",
    " guides.vertical.draw() # Interactive vertical guide editor",
    " guides.horizontal.draw() # Interactive horizontal guide editor",
    "\nFeatures:",
    " - Click to add new guides",
    " - Click existing guides to select them",
    " - Drag to move guides",
    " - Delete key to remove selected guide",
    " - Arrow keys to fine-tune position",
    " - Enter to apply, Escape to cancel",
]
for line in summary:
    print(line)
temp/test_inline_js.py ADDED
@@ -0,0 +1,22 @@
1
"""Probe whether a ``widgets.HTML`` value runs its inline ``<script>``."""

import ipywidgets as widgets
from IPython.display import display

# Markup with a click handler and a console message, both set up by inline JS
html_content = '''
<div id="test-div">Click me!</div>
<script type="text/javascript">
document.getElementById('test-div').addEventListener('click', function() {
alert('Clicked!');
this.innerHTML = 'Clicked at ' + new Date().toLocaleTimeString();
});
console.log('JavaScript is running!');
</script>
'''

# Render the markup through an HTML widget
display(widgets.HTML(value=html_content))

print("If you see 'Click me!' above and can click it, JavaScript is working.")
@@ -0,0 +1,68 @@
1
#!/usr/bin/env python
"""Exercise InteractiveGuideWidget against lightweight stand-in objects."""

import sys
import os

# Make the repository root (one level above this script) importable
script_dir = os.path.dirname(__file__)
repo_root = os.path.abspath(os.path.join(script_dir, '..'))
sys.path.insert(0, repo_root)

# Import the widget and, when ipywidgets is present, try to build one
try:
    from natural_pdf.analyzers.guides import InteractiveGuideWidget, GuidesList, _GUIDE_WIDGET_AVAILABLE
    print("✓ Successfully imported InteractiveGuideWidget")

    if not _GUIDE_WIDGET_AVAILABLE:
        print("⚠ ipywidgets not available - widget functionality disabled")
    else:
        print("✓ ipywidgets is available")

        class MockPage:
            """Page stand-in: an A4 bounding box and a blank render."""

            def __init__(self):
                self.bbox = (0, 0, 595, 842)  # A4 page size in points

            def render(self, resolution=150):
                # Blank image sized from points (72 per inch) to pixels
                from PIL import Image
                width = int(595 * resolution / 72)
                height = int(842 * resolution / 72)
                return Image.new('RGB', (width, height), color='white')

        class MockGuides:
            """Parent stand-in that only carries the page context."""

            def __init__(self):
                self.context = MockPage()

        class MockGuidesList:
            """Guide-list stand-in: three vertical guides and a parent."""

            def __init__(self):
                self.data = [100, 200, 300]
                self._axis = 'vertical'
                self._parent = MockGuides()

        # Build the widget on top of the stand-ins
        mock_guides = MockGuidesList()
        try:
            widget = InteractiveGuideWidget(mock_guides)
            print("✓ Successfully created InteractiveGuideWidget instance")
            print(f" - Widget ID: {widget.widget_id}")
            print(f" - Widget base classes: {InteractiveGuideWidget.__bases__}")

            # Report which of the expected methods the instance offers
            for method in ['_generate_content', 'update_guides']:
                print(
                    f" - Has method: {method}"
                    if hasattr(widget, method)
                    else f" - Missing method: {method}"
                )

        except Exception as e:
            print(f"✗ Error creating widget: {e}")

except ImportError as e:
    print(f"✗ Import error: {e}")

except Exception as e:
    print(f"✗ Unexpected error: {e}")

print("\nWidget implementation test complete!")
@@ -0,0 +1,41 @@
1
#!/usr/bin/env python
"""Simple test for the guide widget.

Verifies that the guides module imports and, when ipywidgets is installed,
that the InteractiveGuideWidget class can be inspected. The closing message
and the exit status reflect what actually happened: a failed check exits with
status 1, and skipped checks are reported as skipped rather than as passed.
"""

import sys
import os

# Add parent directory to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Test importing the module
try:
    from natural_pdf.analyzers.guides import InteractiveGuideWidget, _GUIDE_WIDGET_AVAILABLE
except ImportError as e:
    print(f"✗ Import error: {e}")
    sys.exit(1)

print("✓ Module imported successfully")
print(f"✓ Widget available: {_GUIDE_WIDGET_AVAILABLE}")

failed = False

if _GUIDE_WIDGET_AVAILABLE:
    print("✓ ipywidgets is installed and InteractiveGuideWidget is available")

    # Check if we can inspect the widget class
    try:
        # We can't actually instantiate it without a GuidesList, but we can check the class exists
        print(f"✓ InteractiveGuideWidget class: {InteractiveGuideWidget}")
        print(f"✓ Widget base classes: {InteractiveGuideWidget.__bases__}")

        # Check methods
        methods = [m for m in dir(InteractiveGuideWidget) if not m.startswith('_')]
        print(f"✓ Public methods: {methods}")

    except Exception as e:
        # Keep going so the summary below is printed, but remember the failure
        print(f"✗ Error checking widget class: {e}")
        failed = True
else:
    print("✗ ipywidgets is not installed")
    print("⚠ Skipping widget checks as ipywidgets is not available")

if failed:
    print("\nSome checks failed!")
    sys.exit(1)
elif _GUIDE_WIDGET_AVAILABLE:
    print("\nAll checks passed!")
else:
    print("\nImport check passed; widget checks were skipped.")
@@ -1,42 +0,0 @@
1
- """Debug cell text extraction with exclusions"""
2
- from natural_pdf import PDF
3
- from natural_pdf.analyzers.guides import Guides
4
-
5
- pdf = PDF("pdfs/m27.pdf")
6
- page = pdf.pages[0]
7
-
8
- # Add exclusions
9
- pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
10
-
11
- # Check exclusions are registered
12
- print("Exclusions on page:")
13
- exclusions = page._get_exclusion_regions(debug=True)
14
-
15
- # Create guides and build grid
16
- headers = page.find(text="NUMBER").right(include_source=True).expand(top=3, bottom=3).find_all('text')
17
- guides = Guides(page)
18
- guides.vertical.from_content(headers, align='left')
19
- guides.horizontal.from_stripes()
20
-
21
- # Build grid and get cells
22
- grid_result = guides.build_grid(include_outer_boundaries=True)
23
- cells = grid_result["regions"]["cells"]
24
-
25
- print(f"\nTotal cells: {len(cells)}")
26
-
27
- # Check first row cells (these should be in excluded area)
28
- first_row_cells = [c for c in cells if c.bbox[1] < 90] # y < 90
29
- print(f"\nFirst row cells: {len(first_row_cells)}")
30
-
31
- for i, cell in enumerate(first_row_cells[:3]):
32
- print(f"\nCell {i}:")
33
- print(f" Bbox: {cell.bbox}")
34
- print(f" Raw text: {repr(cell.extract_text(apply_exclusions=False))}")
35
- print(f" With exclusions: {repr(cell.extract_text(apply_exclusions=True))}")
36
-
37
- # Now test the full table extraction
38
- print("\n\nFull table extraction:")
39
- result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
40
- df = result.to_df()
41
- print("\nFirst row of dataframe:")
42
- print(df.iloc[0].to_dict() if not df.empty else "Empty")
@@ -1,43 +0,0 @@
1
- """Debug how exclusions work with overlapping regions"""
2
- from natural_pdf import PDF
3
- from natural_pdf.analyzers.guides import Guides
4
-
5
- pdf = PDF("pdfs/m27.pdf")
6
- page = pdf.pages[0]
7
-
8
- # Add exclusion
9
- pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
10
-
11
- # Get the exclusion region
12
- exclusions = page._get_exclusion_regions()
13
- excl_region = exclusions[0]
14
- print(f"Exclusion region: {excl_region.bbox}")
15
- print(f"Exclusion bottom: {excl_region.bbox[3]}")
16
-
17
- # Create a test cell that overlaps the exclusion
18
- # Cell 1 from before: (32.06, 0.5, 73.18288, 79.53999999999996)
19
- test_cell = page.region(32.06, 0.5, 73.18288, 79.53999999999996)
20
-
21
- print(f"\nTest cell: {test_cell.bbox}")
22
- print(f"Cell overlaps exclusion: top={test_cell.bbox[1]} < excl_bottom={excl_region.bbox[3]}")
23
-
24
- # Extract text from different y-ranges
25
- print("\nText in different parts of the cell:")
26
-
27
- # Part above exclusion line (should be empty)
28
- upper_part = page.region(32.06, 0.5, 73.18288, 59.12)
29
- print(f"Upper part (0.5 to 59.12): '{upper_part.extract_text(apply_exclusions=True)}'")
30
-
31
- # Part below exclusion line (should have text)
32
- lower_part = page.region(32.06, 59.12, 73.18288, 79.54)
33
- print(f"Lower part (59.12 to 79.54): '{lower_part.extract_text()}'")
34
-
35
- # The whole cell
36
- print(f"Whole cell with exclusions: '{test_cell.extract_text(apply_exclusions=True)}'")
37
- print(f"Whole cell without exclusions: '{test_cell.extract_text(apply_exclusions=False)}'")
38
-
39
- # Check what text elements are in this region
40
- print("\nText elements in cell:")
41
- cell_texts = test_cell.find_all('text')
42
- for t in cell_texts[:5]:
43
- print(f" '{t.text}' at y={t.top:.2f}-{t.bottom:.2f}")