natural-pdf 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -0
- natural_pdf/core/page.py +21 -21
- natural_pdf/core/pdf.py +77 -24
- natural_pdf/elements/collections.py +164 -40
- natural_pdf/elements/region.py +90 -40
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +702 -20
- natural_pdf/flows/region.py +52 -4
- natural_pdf/selectors/parser.py +34 -1
- natural_pdf/text_mixin.py +97 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py
CHANGED
@@ -76,6 +76,9 @@ from natural_pdf.core.page import Page
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
+from natural_pdf.flows.flow import Flow
+from natural_pdf.flows.region import FlowRegion
+from natural_pdf.analyzers.guides import Guides
 
 ElementCollection = None
 
@@ -116,6 +119,9 @@ __all__ = [
     "Page",
     "Region",
     "ElementCollection",
+    "Flow",
+    "FlowRegion",
+    "Guides",
     "TextSearchOptions",
     "MultiModalSearchOptions",
     "BaseSearchOptions",
natural_pdf/core/page.py
CHANGED
@@ -64,7 +64,6 @@ from natural_pdf.core.element_manager import ElementManager
 from natural_pdf.describe.mixin import DescribeMixin  # Import describe mixin
 from natural_pdf.elements.base import Element  # Import base element
 from natural_pdf.elements.text import TextElement
-from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.qa import DocumentQA, get_qa_engine
@@ -76,8 +75,9 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
 
 # --- End Classification Imports --- #
 
-
-
+# --- Text update mixin import --- #
+from natural_pdf.text_mixin import TextMixin
+from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
 
 
 try:
@@ -92,7 +92,7 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
-class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
+class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
     """Enhanced Page wrapper built on top of pdfplumber.Page.
 
     This class provides a fluent interface for working with PDF pages,
@@ -2886,25 +2886,25 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
         logger.info(f"Searchable PDF saved to: {output_path_str}")
 
     # --- Added correct_ocr method ---
-    def …
+    def update_text(
         self,
-        …
-        selector: …
+        transform: Callable[[Any], Optional[str]],
+        selector: str = "text",
         max_workers: Optional[int] = None,
         progress_callback: Optional[Callable[[], None]] = None,  # Added progress callback
     ) -> "Page":  # Return self for chaining
         """
-        Applies corrections to …
+        Applies corrections to text elements on this page
         using a user-provided callback function, potentially in parallel.
 
-        Finds text elements on this page …
-        …
-        …
-        … a new string.
+        Finds text elements on this page matching the *selector* argument and
+        calls the ``transform`` for each, passing the element itself.
+        Updates the element's text if the callback returns a new string.
 
         Args:
-            …
-            …
+            transform: A function accepting an element and returning
+                `Optional[str]` (new text or None).
+            selector: CSS-like selector string to match text elements.
             max_workers: The maximum number of threads to use for parallel execution.
                 If None or 0 or 1, runs sequentially.
             progress_callback: Optional callback function to call after processing each element.
@@ -2913,21 +2913,21 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
             Self for method chaining.
         """
         logger.info(
-            f"Page {self.number}: Starting …
+            f"Page {self.number}: Starting text update with callback '{transform.__name__}' (max_workers={max_workers}) and selector='{selector}'"
         )
 
         target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
         target_elements = target_elements_collection.elements  # Get the list
 
         if not target_elements:
-            logger.info(f"Page {self.number}: No …
+            logger.info(f"Page {self.number}: No text elements found to update.")
             return self
 
         element_pbar = None
         try:
             element_pbar = tqdm(
                 total=len(target_elements),
-                desc=f"…
+                desc=f"Updating text Page {self.number}",
                 unit="element",
                 leave=False,
             )
@@ -2941,7 +2941,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
             try:
                 current_text = getattr(element, "text", None)
                 # Call the user-provided callback
-                corrected_text = …
+                corrected_text = transform(element)
 
                 # Validate result type
                 if corrected_text is not None and not isinstance(corrected_text, str):
@@ -2976,7 +2976,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
         if max_workers is not None and max_workers > 1:
             # --- Parallel execution --- #
             logger.info(
-                f"Page {self.number}: Running …
+                f"Page {self.number}: Running text update in parallel with {max_workers} workers."
             )
             futures = []
             with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
@@ -3012,7 +3012,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
 
         else:
             # --- Sequential execution --- #
-            logger.info(f"Page {self.number}: Running …
+            logger.info(f"Page {self.number}: Running text update sequentially.")
             for element in target_elements:
                 # Call the task function directly (it handles progress_callback)
                 processed_count += 1
@@ -3027,7 +3027,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
                 updated_count += 1
 
         logger.info(
-            f"Page {self.number}: …
+            f"Page {self.number}: Text update finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
        )
 
         return self  # Return self for chaining
natural_pdf/core/pdf.py
CHANGED
@@ -39,6 +39,10 @@ from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
 from natural_pdf.utils.locks import pdf_render_lock
+from natural_pdf.text_mixin import TextMixin
+
+if TYPE_CHECKING:
+    from natural_pdf.elements.collections import ElementCollection
 
 try:
     from typing import Any as TypingAny
@@ -247,7 +251,7 @@
 # --- End Lazy Page List Helper --- #
 
 
-class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
+class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
     """Enhanced PDF wrapper built on top of pdfplumber.
 
     This class provides a fluent interface for working with PDF documents,
@@ -1229,6 +1233,62 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
 
         return all_tables
 
+    def get_sections(
+        self,
+        start_elements=None,
+        end_elements=None,
+        new_section_on_page_break=False,
+        boundary_inclusion="both",
+    ) -> "ElementCollection":
+        """
+        Extract sections from the entire PDF based on start/end elements.
+
+        This method delegates to the PageCollection.get_sections() method,
+        providing a convenient way to extract document sections across all pages.
+
+        Args:
+            start_elements: Elements or selector string that mark the start of sections (optional)
+            end_elements: Elements or selector string that mark the end of sections (optional)
+            new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
+            boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
+
+        Returns:
+            ElementCollection of Region objects representing the extracted sections
+
+        Example:
+            Extract sections between headers:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+
+            # Get sections between headers
+            sections = pdf.get_sections(
+                start_elements='text[size>14]:bold',
+                end_elements='text[size>14]:bold'
+            )
+
+            # Get sections that break at page boundaries
+            sections = pdf.get_sections(
+                start_elements='text:contains("Chapter")',
+                new_section_on_page_break=True
+            )
+            ```
+
+        Note:
+            You can provide only start_elements, only end_elements, or both.
+            - With only start_elements: sections go from each start to the next start (or end of document)
+            - With only end_elements: sections go from beginning of document to each end
+            - With both: sections go from each start to the corresponding end
+        """
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
+
+        return self.pages.get_sections(
+            start_elements=start_elements,
+            end_elements=end_elements,
+            new_section_on_page_break=new_section_on_page_break,
+            boundary_inclusion=boundary_inclusion,
+        )
+
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
         """
         DEPRECATED: Use save_pdf(..., ocr=True) instead.
@@ -1703,32 +1763,28 @@
             logger.error(f"Failed to export correction task: {e}")
             raise
 
-    def …
+    def update_text(
         self,
-        …
+        transform: Callable[[Any], Optional[str]],
         pages: Optional[Union[Iterable[int], range, slice]] = None,
+        selector: str = "text",
         max_workers: Optional[int] = None,
         progress_callback: Optional[Callable[[], None]] = None,
     ) -> "PDF":
         """
-        Applies corrections to …
-        Applies corrections to OCR text elements using a callback function.
+        Applies corrections to text elements using a callback function.
 
         Args:
-            correction_callback: Function that takes an element and returns corrected text or None
             correction_callback: Function that takes an element and returns corrected text or None
             pages: Optional page indices/slice to limit the scope of correction
-            …
-            progress_callback: Optional callback function for progress updates
+            selector: Selector to apply corrections to (default: "text")
             max_workers: Maximum number of threads to use for parallel execution
             progress_callback: Optional callback function for progress updates
 
         Returns:
             Self for method chaining
-            Self for method chaining
         """
         target_page_indices = []
-        target_page_indices = []
         if pages is None:
             target_page_indices = list(range(len(self._pages)))
         elif isinstance(pages, slice):
@@ -1741,32 +1797,29 @@
                     raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
             except (IndexError, TypeError, ValueError) as e:
                 raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
-                raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
         else:
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
-            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
 
         if not target_page_indices:
-            logger.warning("No pages selected for …
+            logger.warning("No pages selected for text update.")
             return self
 
-        logger.info(f"Starting …
-        logger.info(f"Starting OCR correction for pages: {target_page_indices}")
+        logger.info(f"Starting text update for pages: {target_page_indices} with selector='{selector}'")
 
         for page_idx in target_page_indices:
             page = self._pages[page_idx]
             try:
-                …
-                …
-                …
-                …
-                …
+                page.update_text(
+                    transform=transform,
+                    selector=selector,
+                    max_workers=max_workers,
+                    progress_callback=progress_callback,
+                )
             except Exception as e:
-                logger.error(f"Error during …
-                logger.error(f"Error during …
+                logger.error(f"Error during text update on page {page_idx}: {e}")
+                logger.error(f"Error during text update on page {page_idx}: {e}")
 
-        logger.info("…
-        logger.info("OCR correction process finished.")
+        logger.info("Text update process finished.")
         return self
 
     def __len__(self) -> int:
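At the document level, PDF.update_text fans the same callback out over the selected pages via each page's update_text. A minimal sketch; the whitespace-normalizing transform is purely illustrative:

```python
import natural_pdf as npdf

pdf = npdf.PDF("document.pdf")  # hypothetical input file

# Collapse runs of whitespace in every text element on the first three
# pages; returning None leaves an element's text untouched.
pdf.update_text(
    lambda el: " ".join(el.text.split()) if el.text else None,
    pages=range(3),
    selector="text",
)
```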
|