natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/__init__.py CHANGED
@@ -76,6 +76,9 @@ from natural_pdf.core.page import Page
76
76
  from natural_pdf.core.pdf import PDF
77
77
  from natural_pdf.elements.collections import ElementCollection
78
78
  from natural_pdf.elements.region import Region
79
+ from natural_pdf.flows.flow import Flow
80
+ from natural_pdf.flows.region import FlowRegion
81
+ from natural_pdf.analyzers.guides import Guides
79
82
 
80
83
  ElementCollection = None
81
84
 
@@ -116,6 +119,9 @@ __all__ = [
116
119
  "Page",
117
120
  "Region",
118
121
  "ElementCollection",
122
+ "Flow",
123
+ "FlowRegion",
124
+ "Guides",
119
125
  "TextSearchOptions",
120
126
  "MultiModalSearchOptions",
121
127
  "BaseSearchOptions",
natural_pdf/core/page.py CHANGED
@@ -64,7 +64,6 @@ from natural_pdf.core.element_manager import ElementManager
64
64
  from natural_pdf.describe.mixin import DescribeMixin # Import describe mixin
65
65
  from natural_pdf.elements.base import Element # Import base element
66
66
  from natural_pdf.elements.text import TextElement
67
- from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
68
67
  from natural_pdf.ocr import OCRManager, OCROptions
69
68
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
70
69
  from natural_pdf.qa import DocumentQA, get_qa_engine
@@ -76,8 +75,9 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
76
75
 
77
76
  # --- End Classification Imports --- #
78
77
 
79
-
80
- # --- End Shape Detection Mixin --- #
78
+ # --- Text update mixin import --- #
79
+ from natural_pdf.text_mixin import TextMixin
80
+ from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
81
81
 
82
82
 
83
83
  try:
@@ -92,7 +92,7 @@ except ImportError:
92
92
  logger = logging.getLogger(__name__)
93
93
 
94
94
 
95
- class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
95
+ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
96
96
  """Enhanced Page wrapper built on top of pdfplumber.Page.
97
97
 
98
98
  This class provides a fluent interface for working with PDF pages,
@@ -1655,7 +1655,27 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1655
1655
  table_settings.setdefault("join_x_tolerance", join)
1656
1656
  table_settings.setdefault("join_y_tolerance", join)
1657
1657
 
1658
- return self._page.extract_tables(table_settings)
1658
+ raw_tables = self._page.extract_tables(table_settings)
1659
+
1660
+ # Apply RTL text processing to all extracted tables
1661
+ if raw_tables:
1662
+ processed_tables = []
1663
+ for table in raw_tables:
1664
+ processed_table = []
1665
+ for row in table:
1666
+ processed_row = []
1667
+ for cell in row:
1668
+ if cell is not None:
1669
+ # Apply RTL text processing to each cell
1670
+ rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
1671
+ processed_row.append(rtl_processed_cell)
1672
+ else:
1673
+ processed_row.append(cell)
1674
+ processed_table.append(processed_row)
1675
+ processed_tables.append(processed_table)
1676
+ return processed_tables
1677
+
1678
+ return raw_tables
1659
1679
  else:
1660
1680
  raise ValueError(
1661
1681
  f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
@@ -2866,25 +2886,25 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2866
2886
  logger.info(f"Searchable PDF saved to: {output_path_str}")
2867
2887
 
2868
2888
  # --- update_text method (renamed from correct_ocr) ---
2869
- def correct_ocr(
2889
+ def update_text(
2870
2890
  self,
2871
- correction_callback: Callable[[Any], Optional[str]],
2872
- selector: Optional[str] = "text[source=ocr]",
2891
+ transform: Callable[[Any], Optional[str]],
2892
+ selector: str = "text",
2873
2893
  max_workers: Optional[int] = None,
2874
2894
  progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
2875
2895
  ) -> "Page": # Return self for chaining
2876
2896
  """
2877
- Applies corrections to OCR-generated text elements on this page
2897
+ Applies corrections to text elements on this page
2878
2898
  using a user-provided callback function, potentially in parallel.
2879
2899
 
2880
- Finds text elements on this page whose 'source' attribute starts
2881
- with 'ocr' and calls the `correction_callback` for each, passing the
2882
- element itself. Updates the element's text if the callback returns
2883
- a new string.
2900
+ Finds text elements on this page matching the *selector* argument and
2901
+ calls the ``transform`` for each, passing the element itself.
2902
+ Updates the element's text if the callback returns a new string.
2884
2903
 
2885
2904
  Args:
2886
- correction_callback: A function accepting an element and returning
2887
- `Optional[str]` (new text or None).
2905
+ transform: A function accepting an element and returning
2906
+ `Optional[str]` (new text or None).
2907
+ selector: CSS-like selector string to match text elements.
2888
2908
  max_workers: The maximum number of threads to use for parallel execution.
2889
2909
  If None or 0 or 1, runs sequentially.
2890
2910
  progress_callback: Optional callback function to call after processing each element.
@@ -2893,21 +2913,21 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2893
2913
  Self for method chaining.
2894
2914
  """
2895
2915
  logger.info(
2896
- f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
2916
+ f"Page {self.number}: Starting text update with callback '{transform.__name__}' (max_workers={max_workers}) and selector='{selector}'"
2897
2917
  )
2898
2918
 
2899
2919
  target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
2900
2920
  target_elements = target_elements_collection.elements # Get the list
2901
2921
 
2902
2922
  if not target_elements:
2903
- logger.info(f"Page {self.number}: No OCR elements found to correct.")
2923
+ logger.info(f"Page {self.number}: No text elements found to update.")
2904
2924
  return self
2905
2925
 
2906
2926
  element_pbar = None
2907
2927
  try:
2908
2928
  element_pbar = tqdm(
2909
2929
  total=len(target_elements),
2910
- desc=f"Correcting OCR Page {self.number}",
2930
+ desc=f"Updating text Page {self.number}",
2911
2931
  unit="element",
2912
2932
  leave=False,
2913
2933
  )
@@ -2921,7 +2941,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2921
2941
  try:
2922
2942
  current_text = getattr(element, "text", None)
2923
2943
  # Call the user-provided callback
2924
- corrected_text = correction_callback(element)
2944
+ corrected_text = transform(element)
2925
2945
 
2926
2946
  # Validate result type
2927
2947
  if corrected_text is not None and not isinstance(corrected_text, str):
@@ -2956,7 +2976,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2956
2976
  if max_workers is not None and max_workers > 1:
2957
2977
  # --- Parallel execution --- #
2958
2978
  logger.info(
2959
- f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
2979
+ f"Page {self.number}: Running text update in parallel with {max_workers} workers."
2960
2980
  )
2961
2981
  futures = []
2962
2982
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
@@ -2992,7 +3012,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2992
3012
 
2993
3013
  else:
2994
3014
  # --- Sequential execution --- #
2995
- logger.info(f"Page {self.number}: Running OCR correction sequentially.")
3015
+ logger.info(f"Page {self.number}: Running text update sequentially.")
2996
3016
  for element in target_elements:
2997
3017
  # Call the task function directly (it handles progress_callback)
2998
3018
  processed_count += 1
@@ -3007,7 +3027,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
3007
3027
  updated_count += 1
3008
3028
 
3009
3029
  logger.info(
3010
- f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
3030
+ f"Page {self.number}: Text update finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
3011
3031
  )
3012
3032
 
3013
3033
  return self # Return self for chaining
@@ -3280,6 +3300,54 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
3280
3300
  )
3281
3301
  return self
3282
3302
 
3303
+ def _apply_rtl_processing_to_text(self, text: str) -> str:
3304
+ """
3305
+ Apply RTL (Right-to-Left) text processing to a string.
3306
+
3307
+ This converts visual order text (as stored in PDFs) to logical order
3308
+ for proper display of Arabic, Hebrew, and other RTL scripts.
3309
+
3310
+ Args:
3311
+ text: Input text string in visual order
3312
+
3313
+ Returns:
3314
+ Text string in logical order
3315
+ """
3316
+ if not text or not text.strip():
3317
+ return text
3318
+
3319
+ # Quick check for RTL characters - if none found, return as-is
3320
+ import unicodedata
3321
+
3322
+ def _contains_rtl(s):
3323
+ return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
3324
+
3325
+ if not _contains_rtl(text):
3326
+ return text
3327
+
3328
+ try:
3329
+ from bidi.algorithm import get_display # type: ignore
3330
+ from natural_pdf.utils.bidi_mirror import mirror_brackets
3331
+
3332
+ # Apply BiDi algorithm to convert from visual to logical order
3333
+ # Process line by line to handle mixed content properly
3334
+ processed_lines = []
3335
+ for line in text.split("\n"):
3336
+ if line.strip():
3337
+ # Determine base direction for this line
3338
+ base_dir = "R" if _contains_rtl(line) else "L"
3339
+ logical_line = get_display(line, base_dir=base_dir)
3340
+ # Apply bracket mirroring for correct logical order
3341
+ processed_lines.append(mirror_brackets(logical_line))
3342
+ else:
3343
+ processed_lines.append(line)
3344
+
3345
+ return "\n".join(processed_lines)
3346
+
3347
+ except (ImportError, Exception):
3348
+ # If bidi library is not available or fails, return original text
3349
+ return text
3350
+
3283
3351
  @property
3284
3352
  def lines(self) -> List[Any]:
3285
3353
  """Get all line elements on this page."""
natural_pdf/core/pdf.py CHANGED
@@ -39,6 +39,10 @@ from natural_pdf.extraction.mixin import ExtractionMixin
39
39
  from natural_pdf.ocr import OCRManager, OCROptions
40
40
  from natural_pdf.selectors.parser import parse_selector
41
41
  from natural_pdf.utils.locks import pdf_render_lock
42
+ from natural_pdf.text_mixin import TextMixin
43
+
44
+ if TYPE_CHECKING:
45
+ from natural_pdf.elements.collections import ElementCollection
42
46
 
43
47
  try:
44
48
  from typing import Any as TypingAny
@@ -103,6 +107,7 @@ except ImportError:
103
107
  from collections.abc import Sequence
104
108
 
105
109
 
110
+
106
111
  class _LazyPageList(Sequence):
107
112
  """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
108
113
 
@@ -121,6 +126,7 @@ class _LazyPageList(Sequence):
121
126
  _font_attrs: Font attributes to use when creating pages.
122
127
  _cache: List of cached Page objects (None until accessed).
123
128
  _load_text: Whether to load text layer when creating pages.
129
+ _indices: Optional range of indices this list represents (for slices).
124
130
 
125
131
  Example:
126
132
  ```python
@@ -130,7 +136,7 @@ class _LazyPageList(Sequence):
130
136
  last_page = pdf.pages[-1] # Creates another Page object
131
137
 
132
138
  # Slicing works too
133
- first_three = pdf.pages[0:3] # Creates 3 Page objects
139
+ first_three = pdf.pages[0:3] # Returns another lazy list
134
140
 
135
141
  # Iteration creates all pages
136
142
  for page in pdf.pages: # Each page created as needed
@@ -139,30 +145,71 @@ class _LazyPageList(Sequence):
139
145
  """
140
146
 
141
147
  def __init__(
142
- self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True
148
+ self,
149
+ parent_pdf: "PDF",
150
+ plumber_pdf: "pdfplumber.PDF",
151
+ font_attrs=None,
152
+ load_text=True,
153
+ indices: Optional[List[int]] = None
143
154
  ):
144
155
  self._parent_pdf = parent_pdf
145
156
  self._plumber_pdf = plumber_pdf
146
157
  self._font_attrs = font_attrs
147
- # One slot per pdfplumber page – initially all None
148
- self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
149
158
  self._load_text = load_text
159
+
160
+ # If indices is provided, this is a sliced view
161
+ if indices is not None:
162
+ self._indices = indices
163
+ self._cache = [None] * len(indices)
164
+ else:
165
+ # Full PDF - one slot per pdfplumber page
166
+ self._indices = list(range(len(plumber_pdf.pages)))
167
+ self._cache = [None] * len(plumber_pdf.pages)
150
168
 
151
169
  # Internal helper -----------------------------------------------------
152
170
  def _create_page(self, index: int) -> "Page":
171
+ """Create and cache a page at the given index within this list."""
153
172
  cached = self._cache[index]
154
173
  if cached is None:
155
174
  # Import here to avoid circular import problems
156
175
  from natural_pdf.core.page import Page
157
176
 
158
- plumber_page = self._plumber_pdf.pages[index]
177
+ # Get the actual page index in the full PDF
178
+ actual_page_index = self._indices[index]
179
+ plumber_page = self._plumber_pdf.pages[actual_page_index]
159
180
  cached = Page(
160
181
  plumber_page,
161
182
  parent=self._parent_pdf,
162
- index=index,
183
+ index=actual_page_index,
163
184
  font_attrs=self._font_attrs,
164
185
  load_text=self._load_text,
165
186
  )
187
+
188
+ # Apply any stored exclusions to the newly created page
189
+ if hasattr(self._parent_pdf, '_exclusions'):
190
+ for exclusion_data in self._parent_pdf._exclusions:
191
+ exclusion_func, label = exclusion_data
192
+ try:
193
+ cached.add_exclusion(exclusion_func, label=label)
194
+ except Exception as e:
195
+ logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
196
+
197
+ # Apply any stored regions to the newly created page
198
+ if hasattr(self._parent_pdf, '_regions'):
199
+ for region_data in self._parent_pdf._regions:
200
+ region_func, name = region_data
201
+ try:
202
+ region_instance = region_func(cached)
203
+ if region_instance and hasattr(region_instance, '__class__'):
204
+ # NOTE: every object has __class__, so this only checks truthiness; the Region type is not verified here (avoids importing Region)
205
+ cached.add_region(region_instance, name=name, source="named")
206
+ elif region_instance is not None:
207
+ logger.warning(
208
+ f"Region function did not return a valid Region for page {cached.number}"
209
+ )
210
+ except Exception as e:
211
+ logger.warning(f"Failed to apply region to page {cached.number}: {e}")
212
+
166
213
  self._cache[index] = cached
167
214
  return cached
168
215
 
@@ -172,9 +219,18 @@ class _LazyPageList(Sequence):
172
219
 
173
220
  def __getitem__(self, key):
174
221
  if isinstance(key, slice):
175
- # Materialise pages for slice lazily as well
176
- indices = range(*key.indices(len(self)))
177
- return [self._create_page(i) for i in indices]
222
+ # Get the slice of our current indices
223
+ slice_indices = range(*key.indices(len(self)))
224
+ # Extract the actual page indices for this slice
225
+ actual_indices = [self._indices[i] for i in slice_indices]
226
+ # Return a new lazy list for the slice
227
+ return _LazyPageList(
228
+ self._parent_pdf,
229
+ self._plumber_pdf,
230
+ font_attrs=self._font_attrs,
231
+ load_text=self._load_text,
232
+ indices=actual_indices
233
+ )
178
234
  elif isinstance(key, int):
179
235
  if key < 0:
180
236
  key += len(self)
@@ -195,7 +251,7 @@ class _LazyPageList(Sequence):
195
251
  # --- End Lazy Page List Helper --- #
196
252
 
197
253
 
198
- class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
254
+ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
199
255
  """Enhanced PDF wrapper built on top of pdfplumber.
200
256
 
201
257
  This class provides a fluent interface for working with PDF documents,
@@ -556,8 +612,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
556
612
  raise AttributeError("PDF pages not yet initialized.")
557
613
 
558
614
  self._exclusions = []
559
- for page in self._pages:
560
- page.clear_exclusions()
615
+
616
+ # Clear exclusions only from already-created (cached) pages to avoid forcing page creation
617
+ for i in range(len(self._pages)):
618
+ if self._pages._cache[i] is not None: # Only clear from existing pages
619
+ try:
620
+ self._pages._cache[i].clear_exclusions()
621
+ except Exception as e:
622
+ logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
561
623
  return self
562
624
 
563
625
  def add_exclusion(
@@ -608,25 +670,35 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
608
670
  raise AttributeError("PDF pages not yet initialized.")
609
671
 
610
672
  # ------------------------------------------------------------------
611
- # NEW: Support selector strings and ElementCollection objects directly.
612
- # We simply forward the same object to each page's add_exclusion which
613
- # now knows how to interpret these inputs.
673
+ # Support selector strings and ElementCollection objects directly.
674
+ # Store exclusion and apply only to already-created pages.
614
675
  # ------------------------------------------------------------------
615
676
  from natural_pdf.elements.collections import ElementCollection # local import
616
677
 
617
678
  if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
618
- # Store for bookkeeping
679
+ # Store for bookkeeping and lazy application
619
680
  self._exclusions.append((exclusion_func, label))
620
- for page in self._pages:
621
- page.add_exclusion(exclusion_func, label=label)
681
+
682
+ # Apply only to already-created (cached) pages to avoid forcing page creation
683
+ for i in range(len(self._pages)):
684
+ if self._pages._cache[i] is not None: # Only apply to existing pages
685
+ try:
686
+ self._pages._cache[i].add_exclusion(exclusion_func, label=label)
687
+ except Exception as e:
688
+ logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
622
689
  return self
623
690
 
624
691
  # Fallback to original callable / Region behaviour ------------------
625
692
  exclusion_data = (exclusion_func, label)
626
693
  self._exclusions.append(exclusion_data)
627
694
 
628
- for page in self._pages:
629
- page.add_exclusion(exclusion_func, label=label)
695
+ # Apply only to already-created (cached) pages to avoid forcing page creation
696
+ for i in range(len(self._pages)):
697
+ if self._pages._cache[i] is not None: # Only apply to existing pages
698
+ try:
699
+ self._pages._cache[i].add_exclusion(exclusion_func, label=label)
700
+ except Exception as e:
701
+ logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
630
702
 
631
703
  return self
632
704
 
@@ -868,7 +940,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
868
940
  Add a region function to the PDF.
869
941
 
870
942
  Args:
871
- region_func: A function that takes a Page and returns a Region, or None
872
943
  region_func: A function that takes a Page and returns a Region, or None
873
944
  name: Optional name for the region
874
945
 
@@ -881,17 +952,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
881
952
  region_data = (region_func, name)
882
953
  self._regions.append(region_data)
883
954
 
884
- for page in self._pages:
885
- try:
886
- region_instance = region_func(page)
887
- if region_instance and isinstance(region_instance, Region):
888
- page.add_region(region_instance, name=name, source="named")
889
- elif region_instance is not None:
890
- logger.warning(
891
- f"Region function did not return a valid Region for page {page.number}"
892
- )
893
- except Exception as e:
894
- logger.error(f"Error adding region for page {page.number}: {e}")
955
+ # Apply only to already-created (cached) pages to avoid forcing page creation
956
+ for i in range(len(self._pages)):
957
+ if self._pages._cache[i] is not None: # Only apply to existing pages
958
+ page = self._pages._cache[i]
959
+ try:
960
+ region_instance = region_func(page)
961
+ if region_instance and isinstance(region_instance, Region):
962
+ page.add_region(region_instance, name=name, source="named")
963
+ elif region_instance is not None:
964
+ logger.warning(
965
+ f"Region function did not return a valid Region for page {page.number}"
966
+ )
967
+ except Exception as e:
968
+ logger.error(f"Error adding region for page {page.number}: {e}")
895
969
 
896
970
  return self
897
971
 
@@ -1159,6 +1233,62 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1159
1233
 
1160
1234
  return all_tables
1161
1235
 
1236
+ def get_sections(
1237
+ self,
1238
+ start_elements=None,
1239
+ end_elements=None,
1240
+ new_section_on_page_break=False,
1241
+ boundary_inclusion="both",
1242
+ ) -> "ElementCollection":
1243
+ """
1244
+ Extract sections from the entire PDF based on start/end elements.
1245
+
1246
+ This method delegates to the PageCollection.get_sections() method,
1247
+ providing a convenient way to extract document sections across all pages.
1248
+
1249
+ Args:
1250
+ start_elements: Elements or selector string that mark the start of sections (optional)
1251
+ end_elements: Elements or selector string that mark the end of sections (optional)
1252
+ new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
1253
+ boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
1254
+
1255
+ Returns:
1256
+ ElementCollection of Region objects representing the extracted sections
1257
+
1258
+ Example:
1259
+ Extract sections between headers:
1260
+ ```python
1261
+ pdf = npdf.PDF("document.pdf")
1262
+
1263
+ # Get sections between headers
1264
+ sections = pdf.get_sections(
1265
+ start_elements='text[size>14]:bold',
1266
+ end_elements='text[size>14]:bold'
1267
+ )
1268
+
1269
+ # Get sections that break at page boundaries
1270
+ sections = pdf.get_sections(
1271
+ start_elements='text:contains("Chapter")',
1272
+ new_section_on_page_break=True
1273
+ )
1274
+ ```
1275
+
1276
+ Note:
1277
+ You can provide only start_elements, only end_elements, or both.
1278
+ - With only start_elements: sections go from each start to the next start (or end of document)
1279
+ - With only end_elements: sections go from beginning of document to each end
1280
+ - With both: sections go from each start to the corresponding end
1281
+ """
1282
+ if not hasattr(self, "_pages"):
1283
+ raise AttributeError("PDF pages not yet initialized.")
1284
+
1285
+ return self.pages.get_sections(
1286
+ start_elements=start_elements,
1287
+ end_elements=end_elements,
1288
+ new_section_on_page_break=new_section_on_page_break,
1289
+ boundary_inclusion=boundary_inclusion,
1290
+ )
1291
+
1162
1292
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
1163
1293
  """
1164
1294
  DEPRECATED: Use save_pdf(..., ocr=True) instead.
@@ -1633,32 +1763,28 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1633
1763
  logger.error(f"Failed to export correction task: {e}")
1634
1764
  raise
1635
1765
 
1636
- def correct_ocr(
1766
+ def update_text(
1637
1767
  self,
1638
- correction_callback: Callable[[Any], Optional[str]],
1768
+ transform: Callable[[Any], Optional[str]],
1639
1769
  pages: Optional[Union[Iterable[int], range, slice]] = None,
1770
+ selector: str = "text",
1640
1771
  max_workers: Optional[int] = None,
1641
1772
  progress_callback: Optional[Callable[[], None]] = None,
1642
1773
  ) -> "PDF":
1643
1774
  """
1644
- Applies corrections to OCR text elements using a callback function.
1645
- Applies corrections to OCR text elements using a callback function.
1775
+ Applies corrections to text elements using a callback function.
1646
1776
 
1647
1777
  Args:
1648
- correction_callback: Function that takes an element and returns corrected text or None
1649
1778
  transform: Function that takes an element and returns corrected text or None
1650
1779
  pages: Optional page indices/slice to limit the scope of correction
1651
- max_workers: Maximum number of threads to use for parallel execution
1652
- progress_callback: Optional callback function for progress updates
1780
+ selector: Selector to apply corrections to (default: "text")
1653
1781
  max_workers: Maximum number of threads to use for parallel execution
1654
1782
  progress_callback: Optional callback function for progress updates
1655
1783
 
1656
1784
  Returns:
1657
1785
  Self for method chaining
1658
- Self for method chaining
1659
1786
  """
1660
1787
  target_page_indices = []
1661
- target_page_indices = []
1662
1788
  if pages is None:
1663
1789
  target_page_indices = list(range(len(self._pages)))
1664
1790
  elif isinstance(pages, slice):
@@ -1671,32 +1797,29 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1671
1797
  raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
1672
1798
  except (IndexError, TypeError, ValueError) as e:
1673
1799
  raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
1674
- raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
1675
1800
  else:
1676
1801
  raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1677
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1678
1802
 
1679
1803
  if not target_page_indices:
1680
- logger.warning("No pages selected for OCR correction.")
1804
+ logger.warning("No pages selected for text update.")
1681
1805
  return self
1682
1806
 
1683
- logger.info(f"Starting OCR correction for pages: {target_page_indices}")
1684
- logger.info(f"Starting OCR correction for pages: {target_page_indices}")
1807
+ logger.info(f"Starting text update for pages: {target_page_indices} with selector='{selector}'")
1685
1808
 
1686
1809
  for page_idx in target_page_indices:
1687
1810
  page = self._pages[page_idx]
1688
1811
  try:
1689
- page.correct_ocr(
1690
- correction_callback=correction_callback,
1691
- max_workers=max_workers,
1692
- progress_callback=progress_callback,
1693
- )
1812
+ page.update_text(
1813
+ transform=transform,
1814
+ selector=selector,
1815
+ max_workers=max_workers,
1816
+ progress_callback=progress_callback,
1817
+ )
1694
1818
  except Exception as e:
1695
- logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
1696
- logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
1819
+ logger.error(f"Error during text update on page {page_idx}: {e}")
1820
+ logger.error(f"Error during text update on page {page_idx}: {e}")
1697
1821
 
1698
- logger.info("OCR correction process finished.")
1699
- logger.info("OCR correction process finished.")
1822
+ logger.info("Text update process finished.")
1700
1823
  return self
1701
1824
 
1702
1825
  def __len__(self) -> int:
@@ -1712,10 +1835,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1712
1835
 
1713
1836
  if isinstance(key, slice):
1714
1837
  from natural_pdf.elements.collections import PageCollection
1715
-
1716
- return PageCollection(self._pages[key])
1717
-
1718
- if isinstance(key, int):
1838
+ # Use the lazy page list's slicing which returns another _LazyPageList
1839
+ lazy_slice = self._pages[key]
1840
+ # Wrap in PageCollection for compatibility
1841
+ return PageCollection(lazy_slice)
1842
+ elif isinstance(key, int):
1719
1843
  if 0 <= key < len(self._pages):
1720
1844
  return self._pages[key]
1721
1845
  else: