natural-pdf 0.1.35__py3-none-any.whl → 0.1.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  # natural_pdf/utils/text_extraction.py
2
2
  import logging
3
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
3
+ import re
4
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
4
5
 
5
6
  from pdfplumber.utils.geometry import (
6
7
  cluster_objects,
@@ -173,6 +174,75 @@ def filter_chars_spatially(
173
174
  return filtered_chars
174
175
 
175
176
 
177
+ def _apply_content_filter(
178
+ char_dicts: List[Dict[str, Any]],
179
+ content_filter: Union[str, Callable[[str], bool], List[str]]
180
+ ) -> List[Dict[str, Any]]:
181
+ """
182
+ Applies content filtering to character dictionaries based on their text content.
183
+
184
+ Args:
185
+ char_dicts: List of character dictionaries to filter.
186
+ content_filter: Can be:
187
+ - A regex pattern string (characters matching the pattern are EXCLUDED)
188
+ - A callable that takes text and returns True to KEEP the character
189
+ - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
190
+
191
+ Returns:
192
+ Filtered list of character dictionaries.
193
+ """
194
+ if not char_dicts or content_filter is None:
195
+ return char_dicts
196
+
197
+ initial_count = len(char_dicts)
198
+ filtered_chars = []
199
+
200
+ # Handle different filter types
201
+ if isinstance(content_filter, str):
202
+ # Single regex pattern - exclude matching characters
203
+ try:
204
+ pattern = re.compile(content_filter)
205
+ for char_dict in char_dicts:
206
+ text = char_dict.get("text", "")
207
+ if not pattern.search(text):
208
+ filtered_chars.append(char_dict)
209
+ except re.error as e:
210
+ logger.warning(f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering.")
211
+ return char_dicts
212
+
213
+ elif isinstance(content_filter, list):
214
+ # List of regex patterns - exclude characters matching ANY pattern
215
+ try:
216
+ patterns = [re.compile(p) for p in content_filter]
217
+ for char_dict in char_dicts:
218
+ text = char_dict.get("text", "")
219
+ if not any(pattern.search(text) for pattern in patterns):
220
+ filtered_chars.append(char_dict)
221
+ except re.error as e:
222
+ logger.warning(f"Invalid regex pattern in list: {e}. Skipping content filtering.")
223
+ return char_dicts
224
+
225
+ elif callable(content_filter):
226
+ # Callable filter - keep characters where function returns True
227
+ try:
228
+ for char_dict in char_dicts:
229
+ text = char_dict.get("text", "")
230
+ if content_filter(text):
231
+ filtered_chars.append(char_dict)
232
+ except Exception as e:
233
+ logger.warning(f"Error in content filter function: {e}. Skipping content filtering.")
234
+ return char_dicts
235
+ else:
236
+ logger.warning(f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering.")
237
+ return char_dicts
238
+
239
+ filtered_count = initial_count - len(filtered_chars)
240
+ if filtered_count > 0:
241
+ logger.debug(f"Content filter removed {filtered_count} characters.")
242
+
243
+ return filtered_chars
244
+
245
+
176
246
  def generate_text_layout(
177
247
  char_dicts: List[Dict[str, Any]],
178
248
  layout_context_bbox: Optional[Tuple[float, float, float, float]] = None,
@@ -206,6 +276,11 @@ def generate_text_layout(
206
276
  # Make a working copy of user_kwargs so we can safely pop custom keys
207
277
  incoming_kwargs = user_kwargs.copy() if user_kwargs else {}
208
278
 
279
+ # --- Apply content filtering if specified ---
280
+ content_filter = incoming_kwargs.pop("content_filter", None)
281
+ if content_filter is not None:
282
+ valid_char_dicts = _apply_content_filter(valid_char_dicts, content_filter)
283
+
209
284
  # --- Handle custom 'strip' option ------------------------------------
210
285
  # * strip=True – post-process the final string to remove leading/trailing
211
286
  # whitespace (typically used when layout=False)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.35
3
+ Version: 0.1.37
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
11
11
  Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
+ Requires-Dist: scikit-learn
14
15
  Requires-Dist: markdown
15
16
  Requires-Dist: pandas
16
17
  Requires-Dist: pdfplumber
@@ -1,7 +1,7 @@
1
1
  natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
2
2
  natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
3
- natural_pdf/analyzers/__init__.py,sha256=IPu_PMKFviDeEIeiC8_2KdeqH7z8OQ6q2v980hkByFY,672
4
- natural_pdf/analyzers/guides.py,sha256=5Lqc51trtqmLvjxLjDS__mgeyviRrjV-CIIT69RmEt4,92327
3
+ natural_pdf/analyzers/__init__.py,sha256=M5oD4oEsIBWrzgTaeg4uBYrC4OPd7Mp7tz8ootKN_l8,1134
4
+ natural_pdf/analyzers/guides.py,sha256=D55ov45PE7mhqvEnarn82y5hG6gmDzk7tYw233LnluA,141896
5
5
  natural_pdf/analyzers/shape_detection_mixin.py,sha256=Ef1o73QYVXQ2QcQMM_W9XRwY6vaIQHgxzD7etJ6LbiM,62820
6
6
  natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
7
7
  natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
@@ -27,8 +27,8 @@ natural_pdf/collections/pdf_collection.py,sha256=sDVEbFMNME_2OaHIsCoR_W7V1cAATNw
27
27
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
28
  natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
29
29
  natural_pdf/core/highlighting_service.py,sha256=2tBrrEq6d6hz5f6Yf7z5TysJdlTyuHTURBnQxokJnDM,40645
30
- natural_pdf/core/page.py,sha256=Jw5SDshnHesqoC4yhtKEokeV08wMHuWZyWs5kDMOAjo,133204
31
- natural_pdf/core/pdf.py,sha256=9t8Ks-AZp3yjH_lRkFZAyIkjUQoCTRbmXK7vSi1e4UE,92415
30
+ natural_pdf/core/page.py,sha256=MwIENkMjEKStC6RlD3SBrqmyZt_MKzrIY7vLBFIvrwY,142529
31
+ natural_pdf/core/pdf.py,sha256=2hK3yRVRxEQMVy1v4w6P26VGoDpCu_3FNkYgN-LO4hA,93221
32
32
  natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
33
33
  natural_pdf/describe/base.py,sha256=CLhZXYQO6SOPUVWLt6VwZ7MK48t_6wgPMyFMLtTCKRc,18166
34
34
  natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
@@ -36,12 +36,12 @@ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ
36
36
  natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
37
37
  natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
38
38
  natural_pdf/elements/base.py,sha256=-ZAcc8lb2aSWTKcprwKTvnR6hsDGDm7T8a1Y9V38E_A,52042
39
- natural_pdf/elements/collections.py,sha256=7i279l8kpgzRyvjRr13n1BeqbC5ufwYx7lu_WmfXWTE,131199
39
+ natural_pdf/elements/collections.py,sha256=_B03lJA1n147alE4xvn6qQ9uZWI8kb8VGxpchghqxqg,131834
40
40
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
41
41
  natural_pdf/elements/line.py,sha256=mHSeV-ZABY-Cc_K_NpFL53OGtTWlexYDlMvZc8_Vrx8,3845
42
42
  natural_pdf/elements/rect.py,sha256=QuQg0Qo7XYQKBac-3Ss0n0ELV6icdPcrygWM2VWzeX8,3325
43
- natural_pdf/elements/region.py,sha256=EqwtZJ2qgMyykuLVv2zO51oKJoSU4Hl7UA_mqTqRzmQ,143419
44
- natural_pdf/elements/text.py,sha256=409RqADe0FYG_i99n6Dy0hl_fWTtBHRCzCq7BP0eAL8,18854
43
+ natural_pdf/elements/region.py,sha256=ewY9HmV_VN6tN_VKtHj7dtk6nh7hrot-pW5Soz5iMg0,148150
44
+ natural_pdf/elements/text.py,sha256=giPJQaXuOBCviQ7QKVx_ZMrKFVpgQAsaCS2-kn-8mp0,20530
45
45
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
46
46
  natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
47
47
  natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
@@ -60,7 +60,7 @@ natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU
60
60
  natural_pdf/flows/collections.py,sha256=iF8SsfKKb-YVIGi3m-yMRnfKgo_0n_EGhojnYK24h-Q,28493
61
61
  natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
62
62
  natural_pdf/flows/flow.py,sha256=ukkUqXsZmEw-QJEiVqEBLC8ktfBG2Bw56_RR1OEsd24,12802
63
- natural_pdf/flows/region.py,sha256=nB634NCuC2BzBHuXAn8Ynf5lwZnR5mWb3RD36iEaPYY,27659
63
+ natural_pdf/flows/region.py,sha256=jRenBFh2ZmFNklNnGkzCsAM0OfMjbP8fo0p7BiVCl_k,31795
64
64
  natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
65
65
  natural_pdf/ocr/engine.py,sha256=SwNlWydtHbrIghV5JD_j5B4-rnjCMYIWUIEARag-zHw,11839
66
66
  natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
@@ -81,7 +81,7 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
81
81
  natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
82
82
  natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
83
83
  natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
84
- natural_pdf/selectors/parser.py,sha256=Flxjo_ZODBLQM8DQlQGqZTTQDyea3zUTzO9L2dtVabM,36402
84
+ natural_pdf/selectors/parser.py,sha256=W1gZuBhGy2uHqCoExzCAFbsiVMKYSgUfCc9cr4rO1V0,37540
85
85
  natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
86
86
  natural_pdf/tables/result.py,sha256=hrGIWDkImpdxsGzugcQKU-qrTgHwwfOigJDFdYl8aUc,3994
87
87
  natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
@@ -90,14 +90,15 @@ natural_pdf/utils/bidi_mirror.py,sha256=jJEES0xDrMfo5Me8kHMxHv4COS51PitnYi2EvKv3
90
90
  natural_pdf/utils/debug.py,sha256=RN7H3E6ph-GtxubCW6psW7TO8o2BxcNLiEzByTVR9fk,995
91
91
  natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
92
92
  natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
93
+ natural_pdf/utils/layout.py,sha256=nCmXiflatJoh-uFtDzfNnlYuRu7Pu9voJWUek6rEMzI,752
93
94
  natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
94
95
  natural_pdf/utils/packaging.py,sha256=e7U2wWvpunlAWpPFexNkD_c4dYbPp5LcKo7og4bNGvk,22411
95
96
  natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
96
- natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6lSjBaOk,10854
97
+ natural_pdf/utils/text_extraction.py,sha256=HYWlYGPfafwzsuMyfL5oQhvcD4NobbvC_aCpcDs9I64,13901
97
98
  natural_pdf/utils/visualization.py,sha256=olDkWtuVzP0NxRg0CP0DL-eXNCY7Bs-SH-2Xn-cjbo0,9370
98
99
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
99
100
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
100
- natural_pdf-0.1.35.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
101
+ natural_pdf-0.1.37.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
101
102
  optimization/memory_comparison.py,sha256=F90D_5WhliSGAct_lyx93xd4q4F-jeo8QpGyDr8tmNw,6543
102
103
  optimization/pdf_analyzer.py,sha256=xf6h-FNlqCpsm8NriXcs_bQZOB8eQkxgGGKVRL_jgCM,19347
103
104
  optimization/performance_analysis.py,sha256=RjAqeE3YS1r_7qTWkY6Ng5YMbb6MXJXfXX6LoVjg_xQ,13035
@@ -114,8 +115,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
114
115
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
115
116
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
116
117
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
117
- natural_pdf-0.1.35.dist-info/METADATA,sha256=SVdCwYrjweXrrmU8m2korCIMJENbN9zDasRCi2pkb8E,6711
118
- natural_pdf-0.1.35.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
119
- natural_pdf-0.1.35.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
120
- natural_pdf-0.1.35.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
121
- natural_pdf-0.1.35.dist-info/RECORD,,
118
+ natural_pdf-0.1.37.dist-info/METADATA,sha256=1POawL7Edgjod2Qt1TO-2DhUkVesip-OnB0KkQCgGQ0,6739
119
+ natural_pdf-0.1.37.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
120
+ natural_pdf-0.1.37.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
121
+ natural_pdf-0.1.37.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
122
+ natural_pdf-0.1.37.dist-info/RECORD,,