natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +1053 -26
- natural_pdf/core/page.py +274 -46
- natural_pdf/core/pdf.py +116 -30
- natural_pdf/elements/collections.py +48 -7
- natural_pdf/elements/region.py +179 -17
- natural_pdf/elements/text.py +36 -2
- natural_pdf/flows/region.py +128 -26
- natural_pdf/selectors/parser.py +24 -0
- natural_pdf/utils/layout.py +26 -0
- natural_pdf/utils/text_extraction.py +76 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/top_level.txt +0 -0
natural_pdf/selectors/parser.py
CHANGED
@@ -24,6 +24,7 @@ This enables powerful document navigation like:
|
|
24
24
|
- page.find('text[size>12]:bold:contains("Summary")')
|
25
25
|
- page.find_all('rect[color~="red"]:above(text:contains("Total"))')
|
26
26
|
- page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
|
27
|
+
- page.find('text:regex("[\u2500-\u257F]")') # Box drawing characters
|
27
28
|
"""
|
28
29
|
|
29
30
|
import ast
|
@@ -748,6 +749,29 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
748
749
|
|
749
750
|
filter_lambda = contains_check
|
750
751
|
|
752
|
+
# --- Handle :regex pseudo-class (same as :contains with regex=True) ---
|
753
|
+
elif name == "regex" and args is not None:
|
754
|
+
ignore_case = not kwargs.get("case", True) # Default case sensitive
|
755
|
+
filter_name = f"pseudo-class :regex({args!r}, ignore_case={ignore_case})"
|
756
|
+
|
757
|
+
def regex_check(element, args=args, ignore_case=ignore_case):
|
758
|
+
if not hasattr(element, "text") or not element.text:
|
759
|
+
return False # Element must have non-empty text
|
760
|
+
|
761
|
+
element_text = element.text
|
762
|
+
search_term = str(args) # Ensure args is string
|
763
|
+
|
764
|
+
try:
|
765
|
+
pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
|
766
|
+
return bool(pattern.search(element_text))
|
767
|
+
except re.error as e:
|
768
|
+
logger.warning(
|
769
|
+
f"Invalid regex '{search_term}' in :regex selector: {e}. Returning False."
|
770
|
+
)
|
771
|
+
return False
|
772
|
+
|
773
|
+
filter_lambda = regex_check
|
774
|
+
|
751
775
|
# --- Handle :startswith and :starts-with (alias) --- #
|
752
776
|
elif name in ("starts-with", "startswith") and args is not None:
|
753
777
|
filter_name = f"pseudo-class :{name}({args!r})"
|
@@ -0,0 +1,26 @@
|
|
1
|
+
from typing import List, Optional, Tuple
|
2
|
+
|
3
|
+
|
4
|
+
def merge_bboxes(
    bboxes: List[Optional[Tuple[float, float, float, float]]]
) -> Optional[Tuple[float, float, float, float]]:
    """
    Merge multiple bounding boxes into a single one that encompasses all of them.

    Entries that are ``None``, that have no length (e.g. a stray number), or
    whose length is not exactly 4 are ignored rather than raising.

    Args:
        bboxes: A list of bbox tuples (x0, top, x1, bottom). Can contain None values.

    Returns:
        A single merged bbox tuple ``(min x0, min top, max x1, max bottom)``,
        or None if no valid bboxes are provided.
    """
    if not bboxes:
        return None

    # Filter out None or invalid bboxes. An explicit ``is None`` test plus a
    # guarded ``len()`` is used instead of truthiness so that a non-sized entry
    # is skipped (not a TypeError) and array-like entries with ambiguous
    # truthiness are still accepted.
    valid_bboxes = []
    for bbox in bboxes:
        if bbox is None:
            continue
        try:
            size = len(bbox)
        except TypeError:
            continue  # Not a sized sequence -> cannot be a bbox
        if size == 4:
            valid_bboxes.append(bbox)

    if not valid_bboxes:
        return None

    # Transpose the N x 4 list into four coordinate columns.
    x0s, tops, x1s, bottoms = zip(*valid_bboxes)

    return (min(x0s), min(tops), max(x1s), max(bottoms))
|
@@ -1,6 +1,7 @@
|
|
1
1
|
# natural_pdf/utils/text_extraction.py
|
2
2
|
import logging
|
3
|
-
|
3
|
+
import re
|
4
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
4
5
|
|
5
6
|
from pdfplumber.utils.geometry import (
|
6
7
|
cluster_objects,
|
@@ -173,6 +174,75 @@ def filter_chars_spatially(
|
|
173
174
|
return filtered_chars
|
174
175
|
|
175
176
|
|
177
|
+
def _apply_content_filter(
|
178
|
+
char_dicts: List[Dict[str, Any]],
|
179
|
+
content_filter: Union[str, Callable[[str], bool], List[str]]
|
180
|
+
) -> List[Dict[str, Any]]:
|
181
|
+
"""
|
182
|
+
Applies content filtering to character dictionaries based on their text content.
|
183
|
+
|
184
|
+
Args:
|
185
|
+
char_dicts: List of character dictionaries to filter.
|
186
|
+
content_filter: Can be:
|
187
|
+
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
188
|
+
- A callable that takes text and returns True to KEEP the character
|
189
|
+
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
190
|
+
|
191
|
+
Returns:
|
192
|
+
Filtered list of character dictionaries.
|
193
|
+
"""
|
194
|
+
if not char_dicts or content_filter is None:
|
195
|
+
return char_dicts
|
196
|
+
|
197
|
+
initial_count = len(char_dicts)
|
198
|
+
filtered_chars = []
|
199
|
+
|
200
|
+
# Handle different filter types
|
201
|
+
if isinstance(content_filter, str):
|
202
|
+
# Single regex pattern - exclude matching characters
|
203
|
+
try:
|
204
|
+
pattern = re.compile(content_filter)
|
205
|
+
for char_dict in char_dicts:
|
206
|
+
text = char_dict.get("text", "")
|
207
|
+
if not pattern.search(text):
|
208
|
+
filtered_chars.append(char_dict)
|
209
|
+
except re.error as e:
|
210
|
+
logger.warning(f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering.")
|
211
|
+
return char_dicts
|
212
|
+
|
213
|
+
elif isinstance(content_filter, list):
|
214
|
+
# List of regex patterns - exclude characters matching ANY pattern
|
215
|
+
try:
|
216
|
+
patterns = [re.compile(p) for p in content_filter]
|
217
|
+
for char_dict in char_dicts:
|
218
|
+
text = char_dict.get("text", "")
|
219
|
+
if not any(pattern.search(text) for pattern in patterns):
|
220
|
+
filtered_chars.append(char_dict)
|
221
|
+
except re.error as e:
|
222
|
+
logger.warning(f"Invalid regex pattern in list: {e}. Skipping content filtering.")
|
223
|
+
return char_dicts
|
224
|
+
|
225
|
+
elif callable(content_filter):
|
226
|
+
# Callable filter - keep characters where function returns True
|
227
|
+
try:
|
228
|
+
for char_dict in char_dicts:
|
229
|
+
text = char_dict.get("text", "")
|
230
|
+
if content_filter(text):
|
231
|
+
filtered_chars.append(char_dict)
|
232
|
+
except Exception as e:
|
233
|
+
logger.warning(f"Error in content filter function: {e}. Skipping content filtering.")
|
234
|
+
return char_dicts
|
235
|
+
else:
|
236
|
+
logger.warning(f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering.")
|
237
|
+
return char_dicts
|
238
|
+
|
239
|
+
filtered_count = initial_count - len(filtered_chars)
|
240
|
+
if filtered_count > 0:
|
241
|
+
logger.debug(f"Content filter removed {filtered_count} characters.")
|
242
|
+
|
243
|
+
return filtered_chars
|
244
|
+
|
245
|
+
|
176
246
|
def generate_text_layout(
|
177
247
|
char_dicts: List[Dict[str, Any]],
|
178
248
|
layout_context_bbox: Optional[Tuple[float, float, float, float]] = None,
|
@@ -206,6 +276,11 @@ def generate_text_layout(
|
|
206
276
|
# Make a working copy of user_kwargs so we can safely pop custom keys
|
207
277
|
incoming_kwargs = user_kwargs.copy() if user_kwargs else {}
|
208
278
|
|
279
|
+
# --- Apply content filtering if specified ---
|
280
|
+
content_filter = incoming_kwargs.pop("content_filter", None)
|
281
|
+
if content_filter is not None:
|
282
|
+
valid_char_dicts = _apply_content_filter(valid_char_dicts, content_filter)
|
283
|
+
|
209
284
|
# --- Handle custom 'strip' option ------------------------------------
|
210
285
|
# * strip=True – post-process the final string to remove leading/trailing
|
211
286
|
# whitespace (typically used when layout=False)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
|
2
2
|
natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
|
3
3
|
natural_pdf/analyzers/__init__.py,sha256=M5oD4oEsIBWrzgTaeg4uBYrC4OPd7Mp7tz8ootKN_l8,1134
|
4
|
-
natural_pdf/analyzers/guides.py,sha256=
|
4
|
+
natural_pdf/analyzers/guides.py,sha256=D55ov45PE7mhqvEnarn82y5hG6gmDzk7tYw233LnluA,141896
|
5
5
|
natural_pdf/analyzers/shape_detection_mixin.py,sha256=Ef1o73QYVXQ2QcQMM_W9XRwY6vaIQHgxzD7etJ6LbiM,62820
|
6
6
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
7
7
|
natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
|
@@ -27,8 +27,8 @@ natural_pdf/collections/pdf_collection.py,sha256=sDVEbFMNME_2OaHIsCoR_W7V1cAATNw
|
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
28
|
natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
|
29
29
|
natural_pdf/core/highlighting_service.py,sha256=2tBrrEq6d6hz5f6Yf7z5TysJdlTyuHTURBnQxokJnDM,40645
|
30
|
-
natural_pdf/core/page.py,sha256=
|
31
|
-
natural_pdf/core/pdf.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=iWokHLuSrQ71kxB_tTWkCp_O-i72urR4iGFUIzKoH8k,145351
|
31
|
+
natural_pdf/core/pdf.py,sha256=5M1gB9psqwJCgE0w7PQ_G1XVa_XCmyNNmluZO7pIyZ4,97112
|
32
32
|
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
33
33
|
natural_pdf/describe/base.py,sha256=CLhZXYQO6SOPUVWLt6VwZ7MK48t_6wgPMyFMLtTCKRc,18166
|
34
34
|
natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
|
@@ -36,12 +36,12 @@ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ
|
|
36
36
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
37
37
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
38
38
|
natural_pdf/elements/base.py,sha256=-ZAcc8lb2aSWTKcprwKTvnR6hsDGDm7T8a1Y9V38E_A,52042
|
39
|
-
natural_pdf/elements/collections.py,sha256=
|
39
|
+
natural_pdf/elements/collections.py,sha256=qtHEaLPxZ6i3zPQsbSOw_KMAr9oDMWR1516ilSMSDeY,133189
|
40
40
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
41
41
|
natural_pdf/elements/line.py,sha256=mHSeV-ZABY-Cc_K_NpFL53OGtTWlexYDlMvZc8_Vrx8,3845
|
42
42
|
natural_pdf/elements/rect.py,sha256=QuQg0Qo7XYQKBac-3Ss0n0ELV6icdPcrygWM2VWzeX8,3325
|
43
|
-
natural_pdf/elements/region.py,sha256=
|
44
|
-
natural_pdf/elements/text.py,sha256=
|
43
|
+
natural_pdf/elements/region.py,sha256=s3iFTq6QNiEgSAEV9ywt-3oQW5_swTvB6FNMgANpvmA,151055
|
44
|
+
natural_pdf/elements/text.py,sha256=giPJQaXuOBCviQ7QKVx_ZMrKFVpgQAsaCS2-kn-8mp0,20530
|
45
45
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
46
46
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
47
47
|
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
@@ -60,7 +60,7 @@ natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU
|
|
60
60
|
natural_pdf/flows/collections.py,sha256=iF8SsfKKb-YVIGi3m-yMRnfKgo_0n_EGhojnYK24h-Q,28493
|
61
61
|
natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
|
62
62
|
natural_pdf/flows/flow.py,sha256=ukkUqXsZmEw-QJEiVqEBLC8ktfBG2Bw56_RR1OEsd24,12802
|
63
|
-
natural_pdf/flows/region.py,sha256=
|
63
|
+
natural_pdf/flows/region.py,sha256=jRenBFh2ZmFNklNnGkzCsAM0OfMjbP8fo0p7BiVCl_k,31795
|
64
64
|
natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
|
65
65
|
natural_pdf/ocr/engine.py,sha256=SwNlWydtHbrIghV5JD_j5B4-rnjCMYIWUIEARag-zHw,11839
|
66
66
|
natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
|
@@ -81,7 +81,7 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
|
|
81
81
|
natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
|
82
82
|
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
83
83
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
84
|
-
natural_pdf/selectors/parser.py,sha256=
|
84
|
+
natural_pdf/selectors/parser.py,sha256=W1gZuBhGy2uHqCoExzCAFbsiVMKYSgUfCc9cr4rO1V0,37540
|
85
85
|
natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
|
86
86
|
natural_pdf/tables/result.py,sha256=hrGIWDkImpdxsGzugcQKU-qrTgHwwfOigJDFdYl8aUc,3994
|
87
87
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
@@ -90,14 +90,15 @@ natural_pdf/utils/bidi_mirror.py,sha256=jJEES0xDrMfo5Me8kHMxHv4COS51PitnYi2EvKv3
|
|
90
90
|
natural_pdf/utils/debug.py,sha256=RN7H3E6ph-GtxubCW6psW7TO8o2BxcNLiEzByTVR9fk,995
|
91
91
|
natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
|
92
92
|
natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
|
93
|
+
natural_pdf/utils/layout.py,sha256=nCmXiflatJoh-uFtDzfNnlYuRu7Pu9voJWUek6rEMzI,752
|
93
94
|
natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
|
94
95
|
natural_pdf/utils/packaging.py,sha256=e7U2wWvpunlAWpPFexNkD_c4dYbPp5LcKo7og4bNGvk,22411
|
95
96
|
natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
|
96
|
-
natural_pdf/utils/text_extraction.py,sha256=
|
97
|
+
natural_pdf/utils/text_extraction.py,sha256=HYWlYGPfafwzsuMyfL5oQhvcD4NobbvC_aCpcDs9I64,13901
|
97
98
|
natural_pdf/utils/visualization.py,sha256=olDkWtuVzP0NxRg0CP0DL-eXNCY7Bs-SH-2Xn-cjbo0,9370
|
98
99
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
99
100
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
100
|
-
natural_pdf-0.1.
|
101
|
+
natural_pdf-0.1.38.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
101
102
|
optimization/memory_comparison.py,sha256=F90D_5WhliSGAct_lyx93xd4q4F-jeo8QpGyDr8tmNw,6543
|
102
103
|
optimization/pdf_analyzer.py,sha256=xf6h-FNlqCpsm8NriXcs_bQZOB8eQkxgGGKVRL_jgCM,19347
|
103
104
|
optimization/performance_analysis.py,sha256=RjAqeE3YS1r_7qTWkY6Ng5YMbb6MXJXfXX6LoVjg_xQ,13035
|
@@ -114,8 +115,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
114
115
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
115
116
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
116
117
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
117
|
-
natural_pdf-0.1.
|
118
|
-
natural_pdf-0.1.
|
119
|
-
natural_pdf-0.1.
|
120
|
-
natural_pdf-0.1.
|
121
|
-
natural_pdf-0.1.
|
118
|
+
natural_pdf-0.1.38.dist-info/METADATA,sha256=7a2BfP1oBRbUDUm_9t-3jCsw9BGjIiGyoFwGQyDvcVo,6739
|
119
|
+
natural_pdf-0.1.38.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
120
|
+
natural_pdf-0.1.38.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
121
|
+
natural_pdf-0.1.38.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
122
|
+
natural_pdf-0.1.38.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|