natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +1053 -26
- natural_pdf/core/page.py +274 -46
- natural_pdf/core/pdf.py +116 -30
- natural_pdf/elements/collections.py +48 -7
- natural_pdf/elements/region.py +179 -17
- natural_pdf/elements/text.py +36 -2
- natural_pdf/flows/region.py +128 -26
- natural_pdf/selectors/parser.py +24 -0
- natural_pdf/utils/layout.py +26 -0
- natural_pdf/utils/text_extraction.py +76 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -103,6 +103,7 @@ except ImportError:
|
|
103
103
|
from collections.abc import Sequence
|
104
104
|
|
105
105
|
|
106
|
+
|
106
107
|
class _LazyPageList(Sequence):
|
107
108
|
"""A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
|
108
109
|
|
@@ -121,6 +122,7 @@ class _LazyPageList(Sequence):
|
|
121
122
|
_font_attrs: Font attributes to use when creating pages.
|
122
123
|
_cache: List of cached Page objects (None until accessed).
|
123
124
|
_load_text: Whether to load text layer when creating pages.
|
125
|
+
_indices: Optional range of indices this list represents (for slices).
|
124
126
|
|
125
127
|
Example:
|
126
128
|
```python
|
@@ -130,7 +132,7 @@ class _LazyPageList(Sequence):
|
|
130
132
|
last_page = pdf.pages[-1] # Creates another Page object
|
131
133
|
|
132
134
|
# Slicing works too
|
133
|
-
first_three = pdf.pages[0:3] #
|
135
|
+
first_three = pdf.pages[0:3] # Returns another lazy list
|
134
136
|
|
135
137
|
# Iteration creates all pages
|
136
138
|
for page in pdf.pages: # Each page created as needed
|
@@ -139,30 +141,71 @@ class _LazyPageList(Sequence):
|
|
139
141
|
"""
|
140
142
|
|
141
143
|
def __init__(
|
142
|
-
self,
|
144
|
+
self,
|
145
|
+
parent_pdf: "PDF",
|
146
|
+
plumber_pdf: "pdfplumber.PDF",
|
147
|
+
font_attrs=None,
|
148
|
+
load_text=True,
|
149
|
+
indices: Optional[List[int]] = None
|
143
150
|
):
|
144
151
|
self._parent_pdf = parent_pdf
|
145
152
|
self._plumber_pdf = plumber_pdf
|
146
153
|
self._font_attrs = font_attrs
|
147
|
-
# One slot per pdfplumber page – initially all None
|
148
|
-
self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
|
149
154
|
self._load_text = load_text
|
155
|
+
|
156
|
+
# If indices is provided, this is a sliced view
|
157
|
+
if indices is not None:
|
158
|
+
self._indices = indices
|
159
|
+
self._cache = [None] * len(indices)
|
160
|
+
else:
|
161
|
+
# Full PDF - one slot per pdfplumber page
|
162
|
+
self._indices = list(range(len(plumber_pdf.pages)))
|
163
|
+
self._cache = [None] * len(plumber_pdf.pages)
|
150
164
|
|
151
165
|
# Internal helper -----------------------------------------------------
|
152
166
|
def _create_page(self, index: int) -> "Page":
|
167
|
+
"""Create and cache a page at the given index within this list."""
|
153
168
|
cached = self._cache[index]
|
154
169
|
if cached is None:
|
155
170
|
# Import here to avoid circular import problems
|
156
171
|
from natural_pdf.core.page import Page
|
157
172
|
|
158
|
-
|
173
|
+
# Get the actual page index in the full PDF
|
174
|
+
actual_page_index = self._indices[index]
|
175
|
+
plumber_page = self._plumber_pdf.pages[actual_page_index]
|
159
176
|
cached = Page(
|
160
177
|
plumber_page,
|
161
178
|
parent=self._parent_pdf,
|
162
|
-
index=
|
179
|
+
index=actual_page_index,
|
163
180
|
font_attrs=self._font_attrs,
|
164
181
|
load_text=self._load_text,
|
165
182
|
)
|
183
|
+
|
184
|
+
# Apply any stored exclusions to the newly created page
|
185
|
+
if hasattr(self._parent_pdf, '_exclusions'):
|
186
|
+
for exclusion_data in self._parent_pdf._exclusions:
|
187
|
+
exclusion_func, label = exclusion_data
|
188
|
+
try:
|
189
|
+
cached.add_exclusion(exclusion_func, label=label)
|
190
|
+
except Exception as e:
|
191
|
+
logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
|
192
|
+
|
193
|
+
# Apply any stored regions to the newly created page
|
194
|
+
if hasattr(self._parent_pdf, '_regions'):
|
195
|
+
for region_data in self._parent_pdf._regions:
|
196
|
+
region_func, name = region_data
|
197
|
+
try:
|
198
|
+
region_instance = region_func(cached)
|
199
|
+
if region_instance and hasattr(region_instance, '__class__'):
|
200
|
+
# Check if it's a Region-like object (avoid importing Region here)
|
201
|
+
cached.add_region(region_instance, name=name, source="named")
|
202
|
+
elif region_instance is not None:
|
203
|
+
logger.warning(
|
204
|
+
f"Region function did not return a valid Region for page {cached.number}"
|
205
|
+
)
|
206
|
+
except Exception as e:
|
207
|
+
logger.warning(f"Failed to apply region to page {cached.number}: {e}")
|
208
|
+
|
166
209
|
self._cache[index] = cached
|
167
210
|
return cached
|
168
211
|
|
@@ -172,9 +215,18 @@ class _LazyPageList(Sequence):
|
|
172
215
|
|
173
216
|
def __getitem__(self, key):
|
174
217
|
if isinstance(key, slice):
|
175
|
-
#
|
176
|
-
|
177
|
-
|
218
|
+
# Get the slice of our current indices
|
219
|
+
slice_indices = range(*key.indices(len(self)))
|
220
|
+
# Extract the actual page indices for this slice
|
221
|
+
actual_indices = [self._indices[i] for i in slice_indices]
|
222
|
+
# Return a new lazy list for the slice
|
223
|
+
return _LazyPageList(
|
224
|
+
self._parent_pdf,
|
225
|
+
self._plumber_pdf,
|
226
|
+
font_attrs=self._font_attrs,
|
227
|
+
load_text=self._load_text,
|
228
|
+
indices=actual_indices
|
229
|
+
)
|
178
230
|
elif isinstance(key, int):
|
179
231
|
if key < 0:
|
180
232
|
key += len(self)
|
@@ -556,12 +608,18 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
556
608
|
raise AttributeError("PDF pages not yet initialized.")
|
557
609
|
|
558
610
|
self._exclusions = []
|
559
|
-
|
560
|
-
|
611
|
+
|
612
|
+
# Clear exclusions only from already-created (cached) pages to avoid forcing page creation
|
613
|
+
for i in range(len(self._pages)):
|
614
|
+
if self._pages._cache[i] is not None: # Only clear from existing pages
|
615
|
+
try:
|
616
|
+
self._pages._cache[i].clear_exclusions()
|
617
|
+
except Exception as e:
|
618
|
+
logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
|
561
619
|
return self
|
562
620
|
|
563
621
|
def add_exclusion(
|
564
|
-
self, exclusion_func
|
622
|
+
self, exclusion_func, label: str = None
|
565
623
|
) -> "PDF":
|
566
624
|
"""Add an exclusion function to the PDF.
|
567
625
|
|
@@ -607,11 +665,36 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
607
665
|
if not hasattr(self, "_pages"):
|
608
666
|
raise AttributeError("PDF pages not yet initialized.")
|
609
667
|
|
668
|
+
# ------------------------------------------------------------------
|
669
|
+
# Support selector strings and ElementCollection objects directly.
|
670
|
+
# Store exclusion and apply only to already-created pages.
|
671
|
+
# ------------------------------------------------------------------
|
672
|
+
from natural_pdf.elements.collections import ElementCollection # local import
|
673
|
+
|
674
|
+
if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
|
675
|
+
# Store for bookkeeping and lazy application
|
676
|
+
self._exclusions.append((exclusion_func, label))
|
677
|
+
|
678
|
+
# Apply only to already-created (cached) pages to avoid forcing page creation
|
679
|
+
for i in range(len(self._pages)):
|
680
|
+
if self._pages._cache[i] is not None: # Only apply to existing pages
|
681
|
+
try:
|
682
|
+
self._pages._cache[i].add_exclusion(exclusion_func, label=label)
|
683
|
+
except Exception as e:
|
684
|
+
logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
|
685
|
+
return self
|
686
|
+
|
687
|
+
# Fallback to original callable / Region behaviour ------------------
|
610
688
|
exclusion_data = (exclusion_func, label)
|
611
689
|
self._exclusions.append(exclusion_data)
|
612
690
|
|
613
|
-
|
614
|
-
|
691
|
+
# Apply only to already-created (cached) pages to avoid forcing page creation
|
692
|
+
for i in range(len(self._pages)):
|
693
|
+
if self._pages._cache[i] is not None: # Only apply to existing pages
|
694
|
+
try:
|
695
|
+
self._pages._cache[i].add_exclusion(exclusion_func, label=label)
|
696
|
+
except Exception as e:
|
697
|
+
logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
|
615
698
|
|
616
699
|
return self
|
617
700
|
|
@@ -853,7 +936,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
853
936
|
Add a region function to the PDF.
|
854
937
|
|
855
938
|
Args:
|
856
|
-
region_func: A function that takes a Page and returns a Region, or None
|
857
939
|
region_func: A function that takes a Page and returns a Region, or None
|
858
940
|
name: Optional name for the region
|
859
941
|
|
@@ -866,17 +948,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
866
948
|
region_data = (region_func, name)
|
867
949
|
self._regions.append(region_data)
|
868
950
|
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
951
|
+
# Apply only to already-created (cached) pages to avoid forcing page creation
|
952
|
+
for i in range(len(self._pages)):
|
953
|
+
if self._pages._cache[i] is not None: # Only apply to existing pages
|
954
|
+
page = self._pages._cache[i]
|
955
|
+
try:
|
956
|
+
region_instance = region_func(page)
|
957
|
+
if region_instance and isinstance(region_instance, Region):
|
958
|
+
page.add_region(region_instance, name=name, source="named")
|
959
|
+
elif region_instance is not None:
|
960
|
+
logger.warning(
|
961
|
+
f"Region function did not return a valid Region for page {page.number}"
|
962
|
+
)
|
963
|
+
except Exception as e:
|
964
|
+
logger.error(f"Error adding region for page {page.number}: {e}")
|
880
965
|
|
881
966
|
return self
|
882
967
|
|
@@ -1697,10 +1782,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1697
1782
|
|
1698
1783
|
if isinstance(key, slice):
|
1699
1784
|
from natural_pdf.elements.collections import PageCollection
|
1700
|
-
|
1701
|
-
|
1702
|
-
|
1703
|
-
|
1785
|
+
# Use the lazy page list's slicing which returns another _LazyPageList
|
1786
|
+
lazy_slice = self._pages[key]
|
1787
|
+
# Wrap in PageCollection for compatibility
|
1788
|
+
return PageCollection(lazy_slice)
|
1789
|
+
elif isinstance(key, int):
|
1704
1790
|
if 0 <= key < len(self._pages):
|
1705
1791
|
return self._pages[key]
|
1706
1792
|
else:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import hashlib
|
2
2
|
import logging
|
3
|
-
from collections.abc import MutableSequence
|
3
|
+
from collections.abc import MutableSequence, Sequence
|
4
4
|
from pathlib import Path
|
5
5
|
from typing import (
|
6
6
|
TYPE_CHECKING,
|
@@ -369,6 +369,7 @@ class ElementCollection(
|
|
369
369
|
preserve_whitespace: bool = True,
|
370
370
|
use_exclusions: bool = True,
|
371
371
|
strip: Optional[bool] = None,
|
372
|
+
content_filter=None,
|
372
373
|
**kwargs,
|
373
374
|
) -> str:
|
374
375
|
"""
|
@@ -379,6 +380,10 @@ class ElementCollection(
|
|
379
380
|
preserve_whitespace: Deprecated. Use layout=False for simple joining.
|
380
381
|
use_exclusions: Deprecated. Exclusions should be applied *before* creating
|
381
382
|
the collection or by filtering the collection itself.
|
383
|
+
content_filter: Optional content filter to exclude specific text patterns. Can be:
|
384
|
+
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
385
|
+
- A callable that takes text and returns True to KEEP the character
|
386
|
+
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
382
387
|
**kwargs: Additional layout parameters passed directly to pdfplumber's
|
383
388
|
`chars_to_textmap` function ONLY if `layout=True` is passed.
|
384
389
|
See Page.extract_text docstring for common parameters.
|
@@ -412,6 +417,11 @@ class ElementCollection(
|
|
412
417
|
getattr(el, "text", "") for el in text_elements
|
413
418
|
) # Fallback to simple join of word text
|
414
419
|
|
420
|
+
# Apply content filtering if provided
|
421
|
+
if content_filter is not None:
|
422
|
+
from natural_pdf.utils.text_extraction import _apply_content_filter
|
423
|
+
all_char_dicts = _apply_content_filter(all_char_dicts, content_filter)
|
424
|
+
|
415
425
|
# Check if layout is requested
|
416
426
|
use_layout = kwargs.get("layout", False)
|
417
427
|
|
@@ -2041,14 +2051,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2041
2051
|
Provides methods for batch operations on these pages.
|
2042
2052
|
"""
|
2043
2053
|
|
2044
|
-
def __init__(self, pages: List[P]):
|
2054
|
+
def __init__(self, pages: Union[List[P], Sequence[P]]):
|
2045
2055
|
"""
|
2046
2056
|
Initialize a page collection.
|
2047
2057
|
|
2048
2058
|
Args:
|
2049
|
-
pages: List of Page objects
|
2059
|
+
pages: List or sequence of Page objects (can be lazy)
|
2050
2060
|
"""
|
2051
|
-
|
2061
|
+
# Store the sequence as-is to preserve lazy behavior
|
2062
|
+
# Only convert to list if we need list-specific operations
|
2063
|
+
if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
|
2064
|
+
self.pages = pages
|
2065
|
+
else:
|
2066
|
+
# Fallback for non-sequence types
|
2067
|
+
self.pages = list(pages)
|
2052
2068
|
|
2053
2069
|
def __len__(self) -> int:
|
2054
2070
|
"""Return the number of pages in the collection."""
|
@@ -2068,6 +2084,31 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2068
2084
|
"""Return a string representation showing the page count."""
|
2069
2085
|
return f"<PageCollection(count={len(self)})>"
|
2070
2086
|
|
2087
|
+
def _get_items_for_apply(self) -> Iterator[P]:
|
2088
|
+
"""
|
2089
|
+
Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
|
2090
|
+
|
2091
|
+
Returns an iterator that yields pages on-demand rather than materializing
|
2092
|
+
all pages at once, maintaining the lazy loading behavior.
|
2093
|
+
"""
|
2094
|
+
return iter(self.pages)
|
2095
|
+
|
2096
|
+
def _get_page_indices(self) -> List[int]:
|
2097
|
+
"""
|
2098
|
+
Get page indices without forcing materialization of pages.
|
2099
|
+
|
2100
|
+
Returns:
|
2101
|
+
List of page indices for the pages in this collection.
|
2102
|
+
"""
|
2103
|
+
# Handle different types of page sequences efficiently
|
2104
|
+
if hasattr(self.pages, '_indices'):
|
2105
|
+
# If it's a _LazyPageList (or slice), get indices directly
|
2106
|
+
return list(self.pages._indices)
|
2107
|
+
else:
|
2108
|
+
# Fallback: if pages are already materialized, get indices normally
|
2109
|
+
# This will force materialization but only if pages aren't lazy
|
2110
|
+
return [p.index for p in self.pages]
|
2111
|
+
|
2071
2112
|
def extract_text(
|
2072
2113
|
self,
|
2073
2114
|
keep_blank_chars: bool = True,
|
@@ -2162,7 +2203,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2162
2203
|
raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
|
2163
2204
|
|
2164
2205
|
# Get the 0-based indices of the pages in this collection
|
2165
|
-
page_indices =
|
2206
|
+
page_indices = self._get_page_indices()
|
2166
2207
|
|
2167
2208
|
logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
|
2168
2209
|
|
@@ -2364,7 +2405,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2364
2405
|
"Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
|
2365
2406
|
)
|
2366
2407
|
|
2367
|
-
page_indices =
|
2408
|
+
page_indices = self._get_page_indices()
|
2368
2409
|
logger.info(
|
2369
2410
|
f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
|
2370
2411
|
)
|
@@ -2790,7 +2831,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2790
2831
|
)
|
2791
2832
|
|
2792
2833
|
# Get the 0-based indices of the pages in this collection
|
2793
|
-
page_indices =
|
2834
|
+
page_indices = self._get_page_indices()
|
2794
2835
|
logger.info(
|
2795
2836
|
f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
|
2796
2837
|
)
|