natural-pdf 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/__init__.py CHANGED
@@ -76,6 +76,9 @@ from natural_pdf.core.page import Page
76
76
  from natural_pdf.core.pdf import PDF
77
77
  from natural_pdf.elements.collections import ElementCollection
78
78
  from natural_pdf.elements.region import Region
79
+ from natural_pdf.flows.flow import Flow
80
+ from natural_pdf.flows.region import FlowRegion
81
+ from natural_pdf.analyzers.guides import Guides
79
82
 
80
83
  ElementCollection = None
81
84
 
@@ -116,6 +119,9 @@ __all__ = [
116
119
  "Page",
117
120
  "Region",
118
121
  "ElementCollection",
122
+ "Flow",
123
+ "FlowRegion",
124
+ "Guides",
119
125
  "TextSearchOptions",
120
126
  "MultiModalSearchOptions",
121
127
  "BaseSearchOptions",
natural_pdf/core/page.py CHANGED
@@ -64,7 +64,6 @@ from natural_pdf.core.element_manager import ElementManager
64
64
  from natural_pdf.describe.mixin import DescribeMixin # Import describe mixin
65
65
  from natural_pdf.elements.base import Element # Import base element
66
66
  from natural_pdf.elements.text import TextElement
67
- from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
68
67
  from natural_pdf.ocr import OCRManager, OCROptions
69
68
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
70
69
  from natural_pdf.qa import DocumentQA, get_qa_engine
@@ -76,8 +75,9 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
76
75
 
77
76
  # --- End Classification Imports --- #
78
77
 
79
-
80
- # --- End Shape Detection Mixin --- #
78
+ # --- Text update mixin import --- #
79
+ from natural_pdf.text_mixin import TextMixin
80
+ from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
81
81
 
82
82
 
83
83
  try:
@@ -92,7 +92,7 @@ except ImportError:
92
92
  logger = logging.getLogger(__name__)
93
93
 
94
94
 
95
- class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
95
+ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
96
96
  """Enhanced Page wrapper built on top of pdfplumber.Page.
97
97
 
98
98
  This class provides a fluent interface for working with PDF pages,
@@ -2886,25 +2886,25 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2886
2886
  logger.info(f"Searchable PDF saved to: {output_path_str}")
2887
2887
 
2888
2888
  # --- Added update_text method ---
2889
- def correct_ocr(
2889
+ def update_text(
2890
2890
  self,
2891
- correction_callback: Callable[[Any], Optional[str]],
2892
- selector: Optional[str] = "text[source=ocr]",
2891
+ transform: Callable[[Any], Optional[str]],
2892
+ selector: str = "text",
2893
2893
  max_workers: Optional[int] = None,
2894
2894
  progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
2895
2895
  ) -> "Page": # Return self for chaining
2896
2896
  """
2897
- Applies corrections to OCR-generated text elements on this page
2897
+ Applies corrections to text elements on this page
2898
2898
  using a user-provided callback function, potentially in parallel.
2899
2899
 
2900
- Finds text elements on this page whose 'source' attribute starts
2901
- with 'ocr' and calls the `correction_callback` for each, passing the
2902
- element itself. Updates the element's text if the callback returns
2903
- a new string.
2900
+ Finds text elements on this page matching the *selector* argument and
2901
+ calls the ``transform`` for each, passing the element itself.
2902
+ Updates the element's text if the callback returns a new string.
2904
2903
 
2905
2904
  Args:
2906
- correction_callback: A function accepting an element and returning
2907
- `Optional[str]` (new text or None).
2905
+ transform: A function accepting an element and returning
2906
+ `Optional[str]` (new text or None).
2907
+ selector: CSS-like selector string to match text elements.
2908
2908
  max_workers: The maximum number of threads to use for parallel execution.
2909
2909
  If None or 0 or 1, runs sequentially.
2910
2910
  progress_callback: Optional callback function to call after processing each element.
@@ -2913,21 +2913,21 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2913
2913
  Self for method chaining.
2914
2914
  """
2915
2915
  logger.info(
2916
- f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
2916
+ f"Page {self.number}: Starting text update with callback '{transform.__name__}' (max_workers={max_workers}) and selector='{selector}'"
2917
2917
  )
2918
2918
 
2919
2919
  target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
2920
2920
  target_elements = target_elements_collection.elements # Get the list
2921
2921
 
2922
2922
  if not target_elements:
2923
- logger.info(f"Page {self.number}: No OCR elements found to correct.")
2923
+ logger.info(f"Page {self.number}: No text elements found to update.")
2924
2924
  return self
2925
2925
 
2926
2926
  element_pbar = None
2927
2927
  try:
2928
2928
  element_pbar = tqdm(
2929
2929
  total=len(target_elements),
2930
- desc=f"Correcting OCR Page {self.number}",
2930
+ desc=f"Updating text Page {self.number}",
2931
2931
  unit="element",
2932
2932
  leave=False,
2933
2933
  )
@@ -2941,7 +2941,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2941
2941
  try:
2942
2942
  current_text = getattr(element, "text", None)
2943
2943
  # Call the user-provided callback
2944
- corrected_text = correction_callback(element)
2944
+ corrected_text = transform(element)
2945
2945
 
2946
2946
  # Validate result type
2947
2947
  if corrected_text is not None and not isinstance(corrected_text, str):
@@ -2976,7 +2976,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2976
2976
  if max_workers is not None and max_workers > 1:
2977
2977
  # --- Parallel execution --- #
2978
2978
  logger.info(
2979
- f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
2979
+ f"Page {self.number}: Running text update in parallel with {max_workers} workers."
2980
2980
  )
2981
2981
  futures = []
2982
2982
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
@@ -3012,7 +3012,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
3012
3012
 
3013
3013
  else:
3014
3014
  # --- Sequential execution --- #
3015
- logger.info(f"Page {self.number}: Running OCR correction sequentially.")
3015
+ logger.info(f"Page {self.number}: Running text update sequentially.")
3016
3016
  for element in target_elements:
3017
3017
  # Call the task function directly (it handles progress_callback)
3018
3018
  processed_count += 1
@@ -3027,7 +3027,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
3027
3027
  updated_count += 1
3028
3028
 
3029
3029
  logger.info(
3030
- f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
3030
+ f"Page {self.number}: Text update finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
3031
3031
  )
3032
3032
 
3033
3033
  return self # Return self for chaining
natural_pdf/core/pdf.py CHANGED
@@ -39,6 +39,10 @@ from natural_pdf.extraction.mixin import ExtractionMixin
39
39
  from natural_pdf.ocr import OCRManager, OCROptions
40
40
  from natural_pdf.selectors.parser import parse_selector
41
41
  from natural_pdf.utils.locks import pdf_render_lock
42
+ from natural_pdf.text_mixin import TextMixin
43
+
44
+ if TYPE_CHECKING:
45
+ from natural_pdf.elements.collections import ElementCollection
42
46
 
43
47
  try:
44
48
  from typing import Any as TypingAny
@@ -247,7 +251,7 @@ class _LazyPageList(Sequence):
247
251
  # --- End Lazy Page List Helper --- #
248
252
 
249
253
 
250
- class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
254
+ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
251
255
  """Enhanced PDF wrapper built on top of pdfplumber.
252
256
 
253
257
  This class provides a fluent interface for working with PDF documents,
@@ -1229,6 +1233,62 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1229
1233
 
1230
1234
  return all_tables
1231
1235
 
1236
+ def get_sections(
1237
+ self,
1238
+ start_elements=None,
1239
+ end_elements=None,
1240
+ new_section_on_page_break=False,
1241
+ boundary_inclusion="both",
1242
+ ) -> "ElementCollection":
1243
+ """
1244
+ Extract sections from the entire PDF based on start/end elements.
1245
+
1246
+ This method delegates to the PageCollection.get_sections() method,
1247
+ providing a convenient way to extract document sections across all pages.
1248
+
1249
+ Args:
1250
+ start_elements: Elements or selector string that mark the start of sections (optional)
1251
+ end_elements: Elements or selector string that mark the end of sections (optional)
1252
+ new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
1253
+ boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
1254
+
1255
+ Returns:
1256
+ ElementCollection of Region objects representing the extracted sections
1257
+
1258
+ Example:
1259
+ Extract sections between headers:
1260
+ ```python
1261
+ pdf = npdf.PDF("document.pdf")
1262
+
1263
+ # Get sections between headers
1264
+ sections = pdf.get_sections(
1265
+ start_elements='text[size>14]:bold',
1266
+ end_elements='text[size>14]:bold'
1267
+ )
1268
+
1269
+ # Get sections that break at page boundaries
1270
+ sections = pdf.get_sections(
1271
+ start_elements='text:contains("Chapter")',
1272
+ new_section_on_page_break=True
1273
+ )
1274
+ ```
1275
+
1276
+ Note:
1277
+ You can provide only start_elements, only end_elements, or both.
1278
+ - With only start_elements: sections go from each start to the next start (or end of document)
1279
+ - With only end_elements: sections go from beginning of document to each end
1280
+ - With both: sections go from each start to the corresponding end
1281
+ """
1282
+ if not hasattr(self, "_pages"):
1283
+ raise AttributeError("PDF pages not yet initialized.")
1284
+
1285
+ return self.pages.get_sections(
1286
+ start_elements=start_elements,
1287
+ end_elements=end_elements,
1288
+ new_section_on_page_break=new_section_on_page_break,
1289
+ boundary_inclusion=boundary_inclusion,
1290
+ )
1291
+
1232
1292
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
1233
1293
  """
1234
1294
  DEPRECATED: Use save_pdf(..., ocr=True) instead.
@@ -1703,32 +1763,28 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1703
1763
  logger.error(f"Failed to export correction task: {e}")
1704
1764
  raise
1705
1765
 
1706
- def correct_ocr(
1766
+ def update_text(
1707
1767
  self,
1708
- correction_callback: Callable[[Any], Optional[str]],
1768
+ transform: Callable[[Any], Optional[str]],
1709
1769
  pages: Optional[Union[Iterable[int], range, slice]] = None,
1770
+ selector: str = "text",
1710
1771
  max_workers: Optional[int] = None,
1711
1772
  progress_callback: Optional[Callable[[], None]] = None,
1712
1773
  ) -> "PDF":
1713
1774
  """
1714
- Applies corrections to OCR text elements using a callback function.
1715
- Applies corrections to OCR text elements using a callback function.
1775
+ Applies corrections to text elements using a callback function.
1716
1776
 
1717
1777
  Args:
1718
- correction_callback: Function that takes an element and returns corrected text or None
1719
1778
  transform: Function that takes an element and returns corrected text or None
1720
1779
  pages: Optional page indices/slice to limit the scope of correction
1721
- max_workers: Maximum number of threads to use for parallel execution
1722
- progress_callback: Optional callback function for progress updates
1780
+ selector: Selector to apply corrections to (default: "text")
1723
1781
  max_workers: Maximum number of threads to use for parallel execution
1724
1782
  progress_callback: Optional callback function for progress updates
1725
1783
 
1726
1784
  Returns:
1727
1785
  Self for method chaining
1728
- Self for method chaining
1729
1786
  """
1730
1787
  target_page_indices = []
1731
- target_page_indices = []
1732
1788
  if pages is None:
1733
1789
  target_page_indices = list(range(len(self._pages)))
1734
1790
  elif isinstance(pages, slice):
@@ -1741,32 +1797,29 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1741
1797
  raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
1742
1798
  except (IndexError, TypeError, ValueError) as e:
1743
1799
  raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
1744
- raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
1745
1800
  else:
1746
1801
  raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1747
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1748
1802
 
1749
1803
  if not target_page_indices:
1750
- logger.warning("No pages selected for OCR correction.")
1804
+ logger.warning("No pages selected for text update.")
1751
1805
  return self
1752
1806
 
1753
- logger.info(f"Starting OCR correction for pages: {target_page_indices}")
1754
- logger.info(f"Starting OCR correction for pages: {target_page_indices}")
1807
+ logger.info(f"Starting text update for pages: {target_page_indices} with selector='{selector}'")
1755
1808
 
1756
1809
  for page_idx in target_page_indices:
1757
1810
  page = self._pages[page_idx]
1758
1811
  try:
1759
- page.correct_ocr(
1760
- correction_callback=correction_callback,
1761
- max_workers=max_workers,
1762
- progress_callback=progress_callback,
1763
- )
1812
+ page.update_text(
1813
+ transform=transform,
1814
+ selector=selector,
1815
+ max_workers=max_workers,
1816
+ progress_callback=progress_callback,
1817
+ )
1764
1818
  except Exception as e:
1765
- logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
1766
- logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
1819
+ logger.error(f"Error during text update on page {page_idx}: {e}")
1820
+ logger.error(f"Error during text update on page {page_idx}: {e}")
1767
1821
 
1768
- logger.info("OCR correction process finished.")
1769
- logger.info("OCR correction process finished.")
1822
+ logger.info("Text update process finished.")
1770
1823
  return self
1771
1824
 
1772
1825
  def __len__(self) -> int: