natural-pdf 0.1.36__py3-none-any.whl → 0.1.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +1053 -26
- natural_pdf/core/page.py +205 -45
- natural_pdf/core/pdf.py +16 -1
- natural_pdf/elements/collections.py +10 -0
- natural_pdf/elements/region.py +106 -14
- natural_pdf/elements/text.py +36 -2
- natural_pdf/flows/region.py +128 -26
- natural_pdf/selectors/parser.py +24 -0
- natural_pdf/utils/layout.py +26 -0
- natural_pdf/utils/text_extraction.py +76 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
|
|
1
1
|
# natural_pdf/utils/text_extraction.py
|
2
2
|
import logging
|
3
|
-
|
3
|
+
import re
|
4
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
4
5
|
|
5
6
|
from pdfplumber.utils.geometry import (
|
6
7
|
cluster_objects,
|
@@ -173,6 +174,75 @@ def filter_chars_spatially(
|
|
173
174
|
return filtered_chars
|
174
175
|
|
175
176
|
|
177
|
+
def _apply_content_filter(
    char_dicts: List[Dict[str, Any]],
    content_filter: Union[str, Callable[[str], bool], List[str]],
) -> List[Dict[str, Any]]:
    """
    Applies content filtering to character dictionaries based on their text content.

    Args:
        char_dicts: List of character dictionaries to filter. The value under
            the "text" key is what gets tested; a missing key or a None value
            is treated as an empty string.
        content_filter: Can be:
            - A regex pattern string (characters matching the pattern are EXCLUDED)
            - A callable that takes text and returns True to KEEP the character
            - A list of regex patterns (characters matching ANY pattern are EXCLUDED)

    Returns:
        Filtered list of character dictionaries. The input list itself is
        returned unchanged when it is empty, when content_filter is None, when
        a pattern cannot be compiled, when the callable raises, or when the
        filter type is unsupported (a warning is logged in the last three cases).
    """
    if not char_dicts or content_filter is None:
        return char_dicts

    def _text_of(char_dict: Dict[str, Any]) -> Any:
        # dict.get's default only covers a *missing* key; an explicit None value
        # would otherwise reach pattern.search() and raise TypeError.
        text = char_dict.get("text")
        return "" if text is None else text

    if isinstance(content_filter, (str, list)):
        single_pattern = isinstance(content_filter, str)
        raw_patterns = [content_filter] if single_pattern else content_filter

        # Only compilation is guarded: re.error covers malformed patterns and
        # TypeError covers entries that are neither strings nor compiled patterns.
        try:
            patterns = [re.compile(raw) for raw in raw_patterns]
        except (re.error, TypeError) as e:
            if single_pattern:
                logger.warning(
                    f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering."
                )
            else:
                logger.warning(f"Invalid regex pattern in list: {e}. Skipping content filtering.")
            return char_dicts

        # Exclusion semantics: a character survives only if NO pattern matches it.
        filtered_chars = [
            char_dict
            for char_dict in char_dicts
            if not any(pattern.search(_text_of(char_dict)) for pattern in patterns)
        ]

    elif callable(content_filter):
        # Inclusion semantics: keep characters for which the callable is truthy.
        # The callable is user-supplied, so any failure falls back to the
        # unfiltered input rather than aborting text extraction.
        try:
            filtered_chars = [
                char_dict for char_dict in char_dicts if content_filter(_text_of(char_dict))
            ]
        except Exception as e:
            logger.warning(f"Error in content filter function: {e}. Skipping content filtering.")
            return char_dicts

    else:
        logger.warning(
            f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering."
        )
        return char_dicts

    filtered_count = len(char_dicts) - len(filtered_chars)
    if filtered_count > 0:
        logger.debug(f"Content filter removed {filtered_count} characters.")

    return filtered_chars
|
244
|
+
|
245
|
+
|
176
246
|
def generate_text_layout(
|
177
247
|
char_dicts: List[Dict[str, Any]],
|
178
248
|
layout_context_bbox: Optional[Tuple[float, float, float, float]] = None,
|
@@ -206,6 +276,11 @@ def generate_text_layout(
|
|
206
276
|
# Make a working copy of user_kwargs so we can safely pop custom keys
|
207
277
|
incoming_kwargs = user_kwargs.copy() if user_kwargs else {}
|
208
278
|
|
279
|
+
# --- Apply content filtering if specified ---
|
280
|
+
content_filter = incoming_kwargs.pop("content_filter", None)
|
281
|
+
if content_filter is not None:
|
282
|
+
valid_char_dicts = _apply_content_filter(valid_char_dicts, content_filter)
|
283
|
+
|
209
284
|
# --- Handle custom 'strip' option ------------------------------------
|
210
285
|
# * strip=True – post-process the final string to remove leading/trailing
|
211
286
|
# whitespace (typically used when layout=False)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
|
2
2
|
natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
|
3
3
|
natural_pdf/analyzers/__init__.py,sha256=M5oD4oEsIBWrzgTaeg4uBYrC4OPd7Mp7tz8ootKN_l8,1134
|
4
|
-
natural_pdf/analyzers/guides.py,sha256=
|
4
|
+
natural_pdf/analyzers/guides.py,sha256=D55ov45PE7mhqvEnarn82y5hG6gmDzk7tYw233LnluA,141896
|
5
5
|
natural_pdf/analyzers/shape_detection_mixin.py,sha256=Ef1o73QYVXQ2QcQMM_W9XRwY6vaIQHgxzD7etJ6LbiM,62820
|
6
6
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
7
7
|
natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
|
@@ -27,8 +27,8 @@ natural_pdf/collections/pdf_collection.py,sha256=sDVEbFMNME_2OaHIsCoR_W7V1cAATNw
|
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
28
|
natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
|
29
29
|
natural_pdf/core/highlighting_service.py,sha256=2tBrrEq6d6hz5f6Yf7z5TysJdlTyuHTURBnQxokJnDM,40645
|
30
|
-
natural_pdf/core/page.py,sha256=
|
31
|
-
natural_pdf/core/pdf.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=MwIENkMjEKStC6RlD3SBrqmyZt_MKzrIY7vLBFIvrwY,142529
|
31
|
+
natural_pdf/core/pdf.py,sha256=2hK3yRVRxEQMVy1v4w6P26VGoDpCu_3FNkYgN-LO4hA,93221
|
32
32
|
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
33
33
|
natural_pdf/describe/base.py,sha256=CLhZXYQO6SOPUVWLt6VwZ7MK48t_6wgPMyFMLtTCKRc,18166
|
34
34
|
natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
|
@@ -36,12 +36,12 @@ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ
|
|
36
36
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
37
37
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
38
38
|
natural_pdf/elements/base.py,sha256=-ZAcc8lb2aSWTKcprwKTvnR6hsDGDm7T8a1Y9V38E_A,52042
|
39
|
-
natural_pdf/elements/collections.py,sha256=
|
39
|
+
natural_pdf/elements/collections.py,sha256=_B03lJA1n147alE4xvn6qQ9uZWI8kb8VGxpchghqxqg,131834
|
40
40
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
41
41
|
natural_pdf/elements/line.py,sha256=mHSeV-ZABY-Cc_K_NpFL53OGtTWlexYDlMvZc8_Vrx8,3845
|
42
42
|
natural_pdf/elements/rect.py,sha256=QuQg0Qo7XYQKBac-3Ss0n0ELV6icdPcrygWM2VWzeX8,3325
|
43
|
-
natural_pdf/elements/region.py,sha256=
|
44
|
-
natural_pdf/elements/text.py,sha256=
|
43
|
+
natural_pdf/elements/region.py,sha256=ewY9HmV_VN6tN_VKtHj7dtk6nh7hrot-pW5Soz5iMg0,148150
|
44
|
+
natural_pdf/elements/text.py,sha256=giPJQaXuOBCviQ7QKVx_ZMrKFVpgQAsaCS2-kn-8mp0,20530
|
45
45
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
46
46
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
47
47
|
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
@@ -60,7 +60,7 @@ natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU
|
|
60
60
|
natural_pdf/flows/collections.py,sha256=iF8SsfKKb-YVIGi3m-yMRnfKgo_0n_EGhojnYK24h-Q,28493
|
61
61
|
natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
|
62
62
|
natural_pdf/flows/flow.py,sha256=ukkUqXsZmEw-QJEiVqEBLC8ktfBG2Bw56_RR1OEsd24,12802
|
63
|
-
natural_pdf/flows/region.py,sha256=
|
63
|
+
natural_pdf/flows/region.py,sha256=jRenBFh2ZmFNklNnGkzCsAM0OfMjbP8fo0p7BiVCl_k,31795
|
64
64
|
natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
|
65
65
|
natural_pdf/ocr/engine.py,sha256=SwNlWydtHbrIghV5JD_j5B4-rnjCMYIWUIEARag-zHw,11839
|
66
66
|
natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
|
@@ -81,7 +81,7 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
|
|
81
81
|
natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
|
82
82
|
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
83
83
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
84
|
-
natural_pdf/selectors/parser.py,sha256=
|
84
|
+
natural_pdf/selectors/parser.py,sha256=W1gZuBhGy2uHqCoExzCAFbsiVMKYSgUfCc9cr4rO1V0,37540
|
85
85
|
natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
|
86
86
|
natural_pdf/tables/result.py,sha256=hrGIWDkImpdxsGzugcQKU-qrTgHwwfOigJDFdYl8aUc,3994
|
87
87
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
@@ -90,14 +90,15 @@ natural_pdf/utils/bidi_mirror.py,sha256=jJEES0xDrMfo5Me8kHMxHv4COS51PitnYi2EvKv3
|
|
90
90
|
natural_pdf/utils/debug.py,sha256=RN7H3E6ph-GtxubCW6psW7TO8o2BxcNLiEzByTVR9fk,995
|
91
91
|
natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
|
92
92
|
natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
|
93
|
+
natural_pdf/utils/layout.py,sha256=nCmXiflatJoh-uFtDzfNnlYuRu7Pu9voJWUek6rEMzI,752
|
93
94
|
natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
|
94
95
|
natural_pdf/utils/packaging.py,sha256=e7U2wWvpunlAWpPFexNkD_c4dYbPp5LcKo7og4bNGvk,22411
|
95
96
|
natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
|
96
|
-
natural_pdf/utils/text_extraction.py,sha256=
|
97
|
+
natural_pdf/utils/text_extraction.py,sha256=HYWlYGPfafwzsuMyfL5oQhvcD4NobbvC_aCpcDs9I64,13901
|
97
98
|
natural_pdf/utils/visualization.py,sha256=olDkWtuVzP0NxRg0CP0DL-eXNCY7Bs-SH-2Xn-cjbo0,9370
|
98
99
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
99
100
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
100
|
-
natural_pdf-0.1.
|
101
|
+
natural_pdf-0.1.37.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
101
102
|
optimization/memory_comparison.py,sha256=F90D_5WhliSGAct_lyx93xd4q4F-jeo8QpGyDr8tmNw,6543
|
102
103
|
optimization/pdf_analyzer.py,sha256=xf6h-FNlqCpsm8NriXcs_bQZOB8eQkxgGGKVRL_jgCM,19347
|
103
104
|
optimization/performance_analysis.py,sha256=RjAqeE3YS1r_7qTWkY6Ng5YMbb6MXJXfXX6LoVjg_xQ,13035
|
@@ -114,8 +115,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
114
115
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
115
116
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
116
117
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
117
|
-
natural_pdf-0.1.
|
118
|
-
natural_pdf-0.1.
|
119
|
-
natural_pdf-0.1.
|
120
|
-
natural_pdf-0.1.
|
121
|
-
natural_pdf-0.1.
|
118
|
+
natural_pdf-0.1.37.dist-info/METADATA,sha256=1POawL7Edgjod2Qt1TO-2DhUkVesip-OnB0KkQCgGQ0,6739
|
119
|
+
natural_pdf-0.1.37.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
120
|
+
natural_pdf-0.1.37.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
121
|
+
natural_pdf-0.1.37.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
122
|
+
natural_pdf-0.1.37.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|