natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.
natural_pdf/core/pdf.py CHANGED
@@ -103,6 +103,7 @@ except ImportError:
103
103
  from collections.abc import Sequence
104
104
 
105
105
 
106
+
106
107
  class _LazyPageList(Sequence):
107
108
  """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
108
109
 
@@ -121,6 +122,7 @@ class _LazyPageList(Sequence):
121
122
  _font_attrs: Font attributes to use when creating pages.
122
123
  _cache: List of cached Page objects (None until accessed).
123
124
  _load_text: Whether to load text layer when creating pages.
125
+ _indices: Optional range of indices this list represents (for slices).
124
126
 
125
127
  Example:
126
128
  ```python
@@ -130,7 +132,7 @@ class _LazyPageList(Sequence):
130
132
  last_page = pdf.pages[-1] # Creates another Page object
131
133
 
132
134
  # Slicing works too
133
- first_three = pdf.pages[0:3] # Creates 3 Page objects
135
+ first_three = pdf.pages[0:3] # Returns another lazy list
134
136
 
135
137
  # Iteration creates all pages
136
138
  for page in pdf.pages: # Each page created as needed
@@ -139,30 +141,71 @@ class _LazyPageList(Sequence):
139
141
  """
140
142
 
141
143
  def __init__(
142
- self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True
144
+ self,
145
+ parent_pdf: "PDF",
146
+ plumber_pdf: "pdfplumber.PDF",
147
+ font_attrs=None,
148
+ load_text=True,
149
+ indices: Optional[List[int]] = None
143
150
  ):
144
151
  self._parent_pdf = parent_pdf
145
152
  self._plumber_pdf = plumber_pdf
146
153
  self._font_attrs = font_attrs
147
- # One slot per pdfplumber page – initially all None
148
- self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
149
154
  self._load_text = load_text
155
+
156
+ # If indices is provided, this is a sliced view
157
+ if indices is not None:
158
+ self._indices = indices
159
+ self._cache = [None] * len(indices)
160
+ else:
161
+ # Full PDF - one slot per pdfplumber page
162
+ self._indices = list(range(len(plumber_pdf.pages)))
163
+ self._cache = [None] * len(plumber_pdf.pages)
150
164
 
151
165
  # Internal helper -----------------------------------------------------
152
166
  def _create_page(self, index: int) -> "Page":
167
+ """Create and cache a page at the given index within this list."""
153
168
  cached = self._cache[index]
154
169
  if cached is None:
155
170
  # Import here to avoid circular import problems
156
171
  from natural_pdf.core.page import Page
157
172
 
158
- plumber_page = self._plumber_pdf.pages[index]
173
+ # Get the actual page index in the full PDF
174
+ actual_page_index = self._indices[index]
175
+ plumber_page = self._plumber_pdf.pages[actual_page_index]
159
176
  cached = Page(
160
177
  plumber_page,
161
178
  parent=self._parent_pdf,
162
- index=index,
179
+ index=actual_page_index,
163
180
  font_attrs=self._font_attrs,
164
181
  load_text=self._load_text,
165
182
  )
183
+
184
+ # Apply any stored exclusions to the newly created page
185
+ if hasattr(self._parent_pdf, '_exclusions'):
186
+ for exclusion_data in self._parent_pdf._exclusions:
187
+ exclusion_func, label = exclusion_data
188
+ try:
189
+ cached.add_exclusion(exclusion_func, label=label)
190
+ except Exception as e:
191
+ logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
192
+
193
+ # Apply any stored regions to the newly created page
194
+ if hasattr(self._parent_pdf, '_regions'):
195
+ for region_data in self._parent_pdf._regions:
196
+ region_func, name = region_data
197
+ try:
198
+ region_instance = region_func(cached)
199
+ if region_instance and hasattr(region_instance, '__class__'):
200
+ # Check if it's a Region-like object (avoid importing Region here)
201
+ cached.add_region(region_instance, name=name, source="named")
202
+ elif region_instance is not None:
203
+ logger.warning(
204
+ f"Region function did not return a valid Region for page {cached.number}"
205
+ )
206
+ except Exception as e:
207
+ logger.warning(f"Failed to apply region to page {cached.number}: {e}")
208
+
166
209
  self._cache[index] = cached
167
210
  return cached
168
211
 
@@ -172,9 +215,18 @@ class _LazyPageList(Sequence):
172
215
 
173
216
  def __getitem__(self, key):
174
217
  if isinstance(key, slice):
175
- # Materialise pages for slice lazily as well
176
- indices = range(*key.indices(len(self)))
177
- return [self._create_page(i) for i in indices]
218
+ # Get the slice of our current indices
219
+ slice_indices = range(*key.indices(len(self)))
220
+ # Extract the actual page indices for this slice
221
+ actual_indices = [self._indices[i] for i in slice_indices]
222
+ # Return a new lazy list for the slice
223
+ return _LazyPageList(
224
+ self._parent_pdf,
225
+ self._plumber_pdf,
226
+ font_attrs=self._font_attrs,
227
+ load_text=self._load_text,
228
+ indices=actual_indices
229
+ )
178
230
  elif isinstance(key, int):
179
231
  if key < 0:
180
232
  key += len(self)
@@ -556,12 +608,18 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
556
608
  raise AttributeError("PDF pages not yet initialized.")
557
609
 
558
610
  self._exclusions = []
559
- for page in self._pages:
560
- page.clear_exclusions()
611
+
612
+ # Clear exclusions only from already-created (cached) pages to avoid forcing page creation
613
+ for i in range(len(self._pages)):
614
+ if self._pages._cache[i] is not None: # Only clear from existing pages
615
+ try:
616
+ self._pages._cache[i].clear_exclusions()
617
+ except Exception as e:
618
+ logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
561
619
  return self
562
620
 
563
621
  def add_exclusion(
564
- self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
622
+ self, exclusion_func, label: str = None
565
623
  ) -> "PDF":
566
624
  """Add an exclusion function to the PDF.
567
625
 
@@ -607,11 +665,36 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
607
665
  if not hasattr(self, "_pages"):
608
666
  raise AttributeError("PDF pages not yet initialized.")
609
667
 
668
+ # ------------------------------------------------------------------
669
+ # Support selector strings and ElementCollection objects directly.
670
+ # Store exclusion and apply only to already-created pages.
671
+ # ------------------------------------------------------------------
672
+ from natural_pdf.elements.collections import ElementCollection # local import
673
+
674
+ if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
675
+ # Store for bookkeeping and lazy application
676
+ self._exclusions.append((exclusion_func, label))
677
+
678
+ # Apply only to already-created (cached) pages to avoid forcing page creation
679
+ for i in range(len(self._pages)):
680
+ if self._pages._cache[i] is not None: # Only apply to existing pages
681
+ try:
682
+ self._pages._cache[i].add_exclusion(exclusion_func, label=label)
683
+ except Exception as e:
684
+ logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
685
+ return self
686
+
687
+ # Fallback to original callable / Region behaviour ------------------
610
688
  exclusion_data = (exclusion_func, label)
611
689
  self._exclusions.append(exclusion_data)
612
690
 
613
- for page in self._pages:
614
- page.add_exclusion(exclusion_func, label=label)
691
+ # Apply only to already-created (cached) pages to avoid forcing page creation
692
+ for i in range(len(self._pages)):
693
+ if self._pages._cache[i] is not None: # Only apply to existing pages
694
+ try:
695
+ self._pages._cache[i].add_exclusion(exclusion_func, label=label)
696
+ except Exception as e:
697
+ logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
615
698
 
616
699
  return self
617
700
 
@@ -853,7 +936,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
853
936
  Add a region function to the PDF.
854
937
 
855
938
  Args:
856
- region_func: A function that takes a Page and returns a Region, or None
857
939
  region_func: A function that takes a Page and returns a Region, or None
858
940
  name: Optional name for the region
859
941
 
@@ -866,17 +948,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
866
948
  region_data = (region_func, name)
867
949
  self._regions.append(region_data)
868
950
 
869
- for page in self._pages:
870
- try:
871
- region_instance = region_func(page)
872
- if region_instance and isinstance(region_instance, Region):
873
- page.add_region(region_instance, name=name, source="named")
874
- elif region_instance is not None:
875
- logger.warning(
876
- f"Region function did not return a valid Region for page {page.number}"
877
- )
878
- except Exception as e:
879
- logger.error(f"Error adding region for page {page.number}: {e}")
951
+ # Apply only to already-created (cached) pages to avoid forcing page creation
952
+ for i in range(len(self._pages)):
953
+ if self._pages._cache[i] is not None: # Only apply to existing pages
954
+ page = self._pages._cache[i]
955
+ try:
956
+ region_instance = region_func(page)
957
+ if region_instance and isinstance(region_instance, Region):
958
+ page.add_region(region_instance, name=name, source="named")
959
+ elif region_instance is not None:
960
+ logger.warning(
961
+ f"Region function did not return a valid Region for page {page.number}"
962
+ )
963
+ except Exception as e:
964
+ logger.error(f"Error adding region for page {page.number}: {e}")
880
965
 
881
966
  return self
882
967
 
@@ -1697,10 +1782,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1697
1782
 
1698
1783
  if isinstance(key, slice):
1699
1784
  from natural_pdf.elements.collections import PageCollection
1700
-
1701
- return PageCollection(self._pages[key])
1702
-
1703
- if isinstance(key, int):
1785
+ # Use the lazy page list's slicing which returns another _LazyPageList
1786
+ lazy_slice = self._pages[key]
1787
+ # Wrap in PageCollection for compatibility
1788
+ return PageCollection(lazy_slice)
1789
+ elif isinstance(key, int):
1704
1790
  if 0 <= key < len(self._pages):
1705
1791
  return self._pages[key]
1706
1792
  else:
natural_pdf/elements/collections.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import hashlib
2
2
  import logging
3
- from collections.abc import MutableSequence
3
+ from collections.abc import MutableSequence, Sequence
4
4
  from pathlib import Path
5
5
  from typing import (
6
6
  TYPE_CHECKING,
@@ -369,6 +369,7 @@ class ElementCollection(
369
369
  preserve_whitespace: bool = True,
370
370
  use_exclusions: bool = True,
371
371
  strip: Optional[bool] = None,
372
+ content_filter=None,
372
373
  **kwargs,
373
374
  ) -> str:
374
375
  """
@@ -379,6 +380,10 @@ class ElementCollection(
379
380
  preserve_whitespace: Deprecated. Use layout=False for simple joining.
380
381
  use_exclusions: Deprecated. Exclusions should be applied *before* creating
381
382
  the collection or by filtering the collection itself.
383
+ content_filter: Optional content filter to exclude specific text patterns. Can be:
384
+ - A regex pattern string (characters matching the pattern are EXCLUDED)
385
+ - A callable that takes text and returns True to KEEP the character
386
+ - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
382
387
  **kwargs: Additional layout parameters passed directly to pdfplumber's
383
388
  `chars_to_textmap` function ONLY if `layout=True` is passed.
384
389
  See Page.extract_text docstring for common parameters.
@@ -412,6 +417,11 @@ class ElementCollection(
412
417
  getattr(el, "text", "") for el in text_elements
413
418
  ) # Fallback to simple join of word text
414
419
 
420
+ # Apply content filtering if provided
421
+ if content_filter is not None:
422
+ from natural_pdf.utils.text_extraction import _apply_content_filter
423
+ all_char_dicts = _apply_content_filter(all_char_dicts, content_filter)
424
+
415
425
  # Check if layout is requested
416
426
  use_layout = kwargs.get("layout", False)
417
427
 
@@ -2041,14 +2051,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2041
2051
  Provides methods for batch operations on these pages.
2042
2052
  """
2043
2053
 
2044
- def __init__(self, pages: List[P]):
2054
+ def __init__(self, pages: Union[List[P], Sequence[P]]):
2045
2055
  """
2046
2056
  Initialize a page collection.
2047
2057
 
2048
2058
  Args:
2049
- pages: List of Page objects
2059
+ pages: List or sequence of Page objects (can be lazy)
2050
2060
  """
2051
- self.pages = pages
2061
+ # Store the sequence as-is to preserve lazy behavior
2062
+ # Only convert to list if we need list-specific operations
2063
+ if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
2064
+ self.pages = pages
2065
+ else:
2066
+ # Fallback for non-sequence types
2067
+ self.pages = list(pages)
2052
2068
 
2053
2069
  def __len__(self) -> int:
2054
2070
  """Return the number of pages in the collection."""
@@ -2068,6 +2084,31 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2068
2084
  """Return a string representation showing the page count."""
2069
2085
  return f"<PageCollection(count={len(self)})>"
2070
2086
 
2087
+ def _get_items_for_apply(self) -> Iterator[P]:
2088
+ """
2089
+ Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
2090
+
2091
+ Returns an iterator that yields pages on-demand rather than materializing
2092
+ all pages at once, maintaining the lazy loading behavior.
2093
+ """
2094
+ return iter(self.pages)
2095
+
2096
+ def _get_page_indices(self) -> List[int]:
2097
+ """
2098
+ Get page indices without forcing materialization of pages.
2099
+
2100
+ Returns:
2101
+ List of page indices for the pages in this collection.
2102
+ """
2103
+ # Handle different types of page sequences efficiently
2104
+ if hasattr(self.pages, '_indices'):
2105
+ # If it's a _LazyPageList (or slice), get indices directly
2106
+ return list(self.pages._indices)
2107
+ else:
2108
+ # Fallback: if pages are already materialized, get indices normally
2109
+ # This will force materialization but only if pages aren't lazy
2110
+ return [p.index for p in self.pages]
2111
+
2071
2112
  def extract_text(
2072
2113
  self,
2073
2114
  keep_blank_chars: bool = True,
@@ -2162,7 +2203,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2162
2203
  raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
2163
2204
 
2164
2205
  # Get the 0-based indices of the pages in this collection
2165
- page_indices = [p.index for p in self.pages]
2206
+ page_indices = self._get_page_indices()
2166
2207
 
2167
2208
  logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
2168
2209
 
@@ -2364,7 +2405,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2364
2405
  "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
2365
2406
  )
2366
2407
 
2367
- page_indices = [p.index for p in self.pages]
2408
+ page_indices = self._get_page_indices()
2368
2409
  logger.info(
2369
2410
  f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
2370
2411
  )
@@ -2790,7 +2831,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2790
2831
  )
2791
2832
 
2792
2833
  # Get the 0-based indices of the pages in this collection
2793
- page_indices = [p.index for p in self.pages]
2834
+ page_indices = self._get_page_indices()
2794
2835
  logger.info(
2795
2836
  f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
2796
2837
  )