PyPI - natural-pdf - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl - Mend

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (52)

docs/categorizing-documents/index.md +168 -0
docs/data-extraction/index.md +87 -0
docs/element-selection/index.ipynb +218 -164
docs/element-selection/index.md +20 -0
docs/index.md +19 -0
docs/ocr/index.md +63 -16
docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
docs/tutorials/02-finding-elements.ipynb +123 -46
docs/tutorials/03-extracting-blocks.ipynb +24 -19
docs/tutorials/04-table-extraction.ipynb +17 -12
docs/tutorials/05-excluding-content.ipynb +37 -32
docs/tutorials/06-document-qa.ipynb +36 -31
docs/tutorials/07-layout-analysis.ipynb +45 -40
docs/tutorials/07-working-with-regions.ipynb +61 -60
docs/tutorials/08-spatial-navigation.ipynb +76 -71
docs/tutorials/09-section-extraction.ipynb +160 -155
docs/tutorials/10-form-field-extraction.ipynb +71 -66
docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
docs/tutorials/12-ocr-integration.ipynb +3420 -312
docs/tutorials/12-ocr-integration.md +68 -106
docs/tutorials/13-semantic-search.ipynb +641 -251
natural_pdf/__init__.py +2 -0
natural_pdf/classification/manager.py +343 -0
natural_pdf/classification/mixin.py +149 -0
natural_pdf/classification/results.py +62 -0
natural_pdf/collections/mixins.py +63 -0
natural_pdf/collections/pdf_collection.py +321 -15
natural_pdf/core/element_manager.py +67 -0
natural_pdf/core/page.py +227 -64
natural_pdf/core/pdf.py +387 -378
natural_pdf/elements/collections.py +272 -41
natural_pdf/elements/region.py +99 -15
natural_pdf/elements/text.py +5 -2
natural_pdf/exporters/paddleocr.py +1 -1
natural_pdf/extraction/manager.py +134 -0
natural_pdf/extraction/mixin.py +246 -0
natural_pdf/extraction/result.py +37 -0
natural_pdf/ocr/engine_easyocr.py +6 -3
natural_pdf/ocr/ocr_manager.py +85 -25
natural_pdf/ocr/ocr_options.py +33 -10
natural_pdf/ocr/utils.py +14 -3
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/selectors/parser.py +363 -238
natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/text_extraction.py +52 -1
natural_pdf/utils/tqdm_utils.py +43 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0

natural_pdf/elements/collections.py CHANGED Viewed

@@ -11,17 +11,22 @@ from typing import (
     Tuple,
     TypeVar,
     Union,
+    Iterable,
 )
 from pdfplumber.utils.geometry import objects_to_bbox
+from tqdm.auto import tqdm
 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
-from natural_pdf.elements.text import TextElement  # Needed for isinstance check
+from natural_pdf.elements.text import TextElement
 from natural_pdf.ocr import OCROptions
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
-from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import the new utility
+from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.classification.manager import ClassificationManager
+from natural_pdf.collections.mixins import ApplyMixin
 logger = logging.getLogger(__name__)
@@ -33,7 +38,7 @@ T = TypeVar("T")
 P = TypeVar("P", bound="Page")
-class ElementCollection(Generic[T]):
+class ElementCollection(Generic[T], ApplyMixin):
     """
     Collection of PDF elements with batch operations.
     """
@@ -83,12 +88,55 @@ class ElementCollection(Generic[T]):
         """Get the last element in the collection."""
         return self._elements[-1] if self._elements else None
+    def _are_on_multiple_pages(self) -> bool:
+        """
+        Check if elements in this collection span multiple pages.
+        Returns:
+            True if elements are on different pages, False otherwise
+        """
+        if not self._elements:
+            return False
+        # Get the page index of the first element
+        if not hasattr(self._elements[0], "page"):
+            return False
+        first_page_idx = self._elements[0].page.index
+        # Check if any element is on a different page
+        return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
+    def _are_on_multiple_pdfs(self) -> bool:
+        """
+        Check if elements in this collection span multiple PDFs.
+        Returns:
+            True if elements are from different PDFs, False otherwise
+        """
+        if not self._elements:
+            return False
+        # Get the PDF of the first element
+        if not hasattr(self._elements[0], "page") or not hasattr(self._elements[0].page, "pdf"):
+            return False
+        first_pdf = self._elements[0].page.pdf
+        # Check if any element is from a different PDF
+        return any(
+            hasattr(e, "page") and
+            hasattr(e.page, "pdf") and
+            e.page.pdf is not first_pdf
+            for e in self._elements
+        )
     def highest(self) -> Optional["Element"]:
         """
         Get element with the smallest top y-coordinate (highest on page).
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
         Returns:
             Element with smallest top value or None if empty
@@ -96,7 +144,9 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine highest element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine highest element across multiple pages")
@@ -107,7 +157,7 @@ class ElementCollection(Generic[T]):
         Get element with the largest bottom y-coordinate (lowest on page).
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
         Returns:
             Element with largest bottom value or None if empty
@@ -115,7 +165,9 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine lowest element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine lowest element across multiple pages")
@@ -126,7 +178,7 @@ class ElementCollection(Generic[T]):
         Get element with the smallest x0 coordinate (leftmost on page).
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
         Returns:
             Element with smallest x0 value or None if empty
@@ -134,7 +186,9 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine leftmost element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine leftmost element across multiple pages")
@@ -145,7 +199,7 @@ class ElementCollection(Generic[T]):
         Get element with the largest x1 coordinate (rightmost on page).
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
         Returns:
             Element with largest x1 value or None if empty
@@ -153,31 +207,14 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine rightmost element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine rightmost element across multiple pages")
         return max(self._elements, key=lambda e: e.x1)
-    def _are_on_multiple_pages(self) -> bool:
-        """
-        Check if elements in this collection span multiple pages.
-        Returns:
-            True if elements are on different pages, False otherwise
-        """
-        if not self._elements:
-            return False
-        # Get the page index of the first element
-        if not hasattr(self._elements[0], "page"):
-            return False
-        first_page_idx = self._elements[0].page.index
-        # Check if any element is on a different page
-        return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
     def exclude_regions(self, regions: List["Region"]) -> "ElementCollection":
         """
         Remove elements that are within any of the specified regions.
@@ -359,6 +396,9 @@ class ElementCollection(Generic[T]):
         Uses grouping logic based on parameters (defaulting to grouping by type).
+        Note: Elements must be from the same PDF for this operation to work properly,
+        as each PDF has its own highlighting service.
         Args:
             label: Optional explicit label for the entire collection. If provided,
                    all elements are highlighted as a single group with this label,
@@ -389,8 +429,12 @@ class ElementCollection(Generic[T]):
             AttributeError: If 'group_by' is provided but the attribute doesn't exist
                             on some elements.
             ValueError: If 'label_format' is provided but contains invalid keys for
-                        element attributes.
+                        element attributes, or if elements span multiple PDFs.
         """
+        # Check if elements span multiple PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("highlight() does not support elements from multiple PDFs")
         # 1. Prepare the highlight data based on parameters
         highlight_data_list = self._prepare_highlight_data(
             distinct=distinct,
@@ -761,7 +805,8 @@ class ElementCollection(Generic[T]):
         Generates a temporary preview image highlighting elements in this collection
         on their page, ignoring any persistent highlights.
-        Currently only supports collections where all elements are on the same page.
+        Currently only supports collections where all elements are on the same page
+        of the same PDF.
         Allows grouping and coloring elements based on attributes, similar to the
         persistent `highlight()` method, but only for this temporary view.
@@ -780,14 +825,20 @@ class ElementCollection(Generic[T]):
         Returns:
             PIL Image object of the temporary preview, or None if rendering fails or
-            elements span multiple pages.
+            elements span multiple pages/PDFs.
         Raises:
-            ValueError: If the collection is empty or elements are on different pages.
+            ValueError: If the collection is empty or elements are on different pages/PDFs.
         """
         if not self._elements:
             raise ValueError("Cannot show an empty collection.")
+        # Check if elements are on multiple PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError(
+                "show() currently only supports collections where all elements are from the same PDF."
+            )
         # Check if elements are on multiple pages
         if self._are_on_multiple_pages():
             raise ValueError(
@@ -1122,10 +1173,12 @@ class ElementCollection(Generic[T]):
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
+        max_workers: Optional[int] = None,
     ) -> "ElementCollection":
         """
         Applies corrections to OCR-generated text elements within this collection
-        using a user-provided callback function.
+        using a user-provided callback function, executed
+        in parallel if `max_workers` is specified.
         Iterates through elements currently in the collection. If an element's
         'source' attribute starts with 'ocr', it calls the `correction_callback`
@@ -1143,6 +1196,8 @@ class ElementCollection(Generic[T]):
         Args:
             correction_callback: A function accepting an element and returning
                                  `Optional[str]` (new text or None).
+            max_workers: The maximum number of worker threads to use for parallel
+                         correction on each page. If None, defaults are used.
         Returns:
             Self for method chaining.
@@ -1152,11 +1207,169 @@ class ElementCollection(Generic[T]):
             elements=self._elements,
             correction_callback=correction_callback,
             caller_info=f"ElementCollection(len={len(self._elements)})",  # Pass caller info
+            max_workers=max_workers,
         )
         return self  # Return self for chaining
+    def remove(self) -> int:
+        """
+        Remove all elements in this collection from their respective pages.
+        This method removes elements from the page's _element_mgr storage.
+        It's particularly useful for removing OCR elements before applying new OCR.
+        Returns:
+            int: Number of elements successfully removed
+        """
+        if not self._elements:
+            return 0
+        removed_count = 0
+        for element in self._elements:
+            # Each element should have a reference to its page
+            if hasattr(element, "page") and hasattr(element.page, "_element_mgr"):
+                element_mgr = element.page._element_mgr
+                # Determine element type
+                element_type = getattr(element, "object_type", None)
+                if element_type:
+                    # Convert to plural form expected by element_mgr
+                    if element_type == "word":
+                        element_type = "words"
+                    elif element_type == "char":
+                        element_type = "chars"
+                    elif element_type == "rect":
+                        element_type = "rects"
+                    elif element_type == "line":
+                        element_type = "lines"
+                    # Try to remove from the element manager
+                    if hasattr(element_mgr, "remove_element"):
+                        success = element_mgr.remove_element(element, element_type)
+                        if success:
+                            removed_count += 1
+                    else:
+                        logger.warning("ElementManager does not have remove_element method")
+            else:
+                logger.warning(f"Element has no page or page has no _element_mgr: {element}")
+        return removed_count
+    # --- Classification Method --- #
+    def classify_all(
+        self,
+        categories: List[str],
+        model: Optional[str] = None,
+        using: Optional[str] = None,
+        min_confidence: float = 0.0,
+        analysis_key: str = 'classification',
+        multi_label: bool = False,
+        batch_size: int = 8,
+        max_workers: Optional[int] = None,
+        progress_bar: bool = True,
+        **kwargs
+    ):
+        """Classifies all elements in the collection in batch.
+        Args:
+            categories: List of category labels.
+            model: Model ID (or alias 'text', 'vision').
+            using: Optional processing mode ('text' or 'vision'). Inferred if None.
+            min_confidence: Minimum confidence threshold.
+            analysis_key: Key for storing results in element.analyses.
+            multi_label: Allow multiple labels per item.
+            batch_size: Size of batches passed to the inference pipeline.
+            max_workers: (Not currently used for classification batching which is
+                         handled by the underlying pipeline).
+            progress_bar: Display a progress bar.
+            **kwargs: Additional arguments for the ClassificationManager.
+        """
+        if not self.elements:
+            logger.info("ElementCollection is empty, skipping classification.")
+            return self
+        # Requires access to the PDF's manager. Assume first element has it.
+        first_element = self.elements[0]
+        manager_source = None
+        if hasattr(first_element, 'page') and hasattr(first_element.page, 'pdf'):
+             manager_source = first_element.page.pdf
+        elif hasattr(first_element, 'pdf'): # Maybe it's a PageCollection?
+             manager_source = first_element.pdf
+        if not manager_source or not hasattr(manager_source, 'get_manager'):
+             raise RuntimeError("Cannot access ClassificationManager via elements.")
+        try:
+            manager = manager_source.get_manager('classification')
+        except Exception as e:
+             raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e
+        if not manager or not manager.is_available():
+             raise RuntimeError("ClassificationManager is not available.")
+        # Determine engine type early for content gathering
+        inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
+        # Gather content from all elements
+        items_to_classify: List[Tuple[Any, Union[str, Image.Image]]] = []
+        original_elements: List[Any] = []
+        logger.info(f"Gathering content for {len(self.elements)} elements for batch classification...")
+        for element in self.elements:
+             if not isinstance(element, ClassificationMixin):
+                 logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
+                 continue
+             try:
+                 # Delegate content fetching to the element itself
+                 content = element._get_classification_content(model_type=inferred_using, **kwargs)
+                 items_to_classify.append(content)
+                 original_elements.append(element)
+             except (ValueError, NotImplementedError) as e:
+                 logger.warning(f"Skipping element {element!r}: Cannot get content for classification - {e}")
+             except Exception as e:
+                  logger.warning(f"Skipping element {element!r}: Error getting classification content - {e}")
+        if not items_to_classify:
+             logger.warning("No content could be gathered from elements for batch classification.")
+             return self
+        logger.info(f"Collected content for {len(items_to_classify)} elements. Running batch classification...")
+        # Call manager's batch classify
+        batch_results: List[ClassificationResult] = manager.classify_batch(
+            item_contents=items_to_classify,
+            categories=categories,
+            model_id=model,
+            using=inferred_using,
+            min_confidence=min_confidence,
+            multi_label=multi_label,
+            batch_size=batch_size,
+            progress_bar=progress_bar,
+            **kwargs
+        )
+        # Assign results back to elements
+        if len(batch_results) != len(original_elements):
+             logger.error(
+                 f"Batch classification result count ({len(batch_results)}) mismatch "
+                 f"with elements processed ({len(original_elements)}). Cannot assign results."
+             )
+             # Decide how to handle mismatch - maybe store errors?
+        else:
+             logger.info(f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'.")
+             for element, result_obj in zip(original_elements, batch_results):
+                 try:
+                     if not hasattr(element, 'analyses') or element.analyses is None:
+                          element.analyses = {}
+                     element.analyses[analysis_key] = result_obj
+                 except Exception as e:
+                      logger.warning(f"Failed to store classification result for {element!r}: {e}")
+        return self
+    # --- End Classification Method --- #
-class PageCollection(Generic[P]):
+class PageCollection(Generic[P], ApplyMixin):
     """
     A collection of PDF pages with cross-page operations.
@@ -1221,6 +1434,7 @@ class PageCollection(Generic[P]):
         device: Optional[str] = None,
         resolution: Optional[int] = None,  # DPI for rendering
         apply_exclusions: bool = True,  # New parameter
+        replace: bool = True,  # Whether to replace existing OCR elements
         # --- Engine-Specific Options ---
         options: Optional[Any] = None,  # e.g., EasyOCROptions(...)
     ) -> "PageCollection[P]":
@@ -1240,6 +1454,8 @@ class PageCollection(Generic[P]):
             apply_exclusions: If True (default), render page images for OCR with
                               excluded areas masked (whited out). If False, OCR
                               the raw page images without masking exclusions.
+            replace: If True (default), remove any existing OCR elements before
+                    adding new ones. If False, add new OCR elements to existing ones.
             options: An engine-specific options object (e.g., EasyOCROptions) or dict.
         Returns:
@@ -1277,6 +1493,7 @@ class PageCollection(Generic[P]):
             device=device,
             resolution=resolution,
             apply_exclusions=apply_exclusions,  # Pass down
+            replace=replace,  # Pass the replace parameter
             options=options,
         )
         # The PDF method modifies the Page objects directly by adding elements.
@@ -1324,10 +1541,12 @@ class PageCollection(Generic[P]):
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
+        max_workers: Optional[int] = None,
     ) -> "PageCollection[P]":
         """
         Applies corrections to OCR-generated text elements across all pages
-        in this collection using a user-provided callback function.
+        in this collection using a user-provided callback function, executed
+        in parallel if `max_workers` is specified.
         This method delegates to the parent PDF's `correct_ocr` method,
         targeting all pages within this collection.
@@ -1335,10 +1554,11 @@ class PageCollection(Generic[P]):
         Args:
             correction_callback: A function that accepts a single argument (an element
                                  object) and returns `Optional[str]` (new text or None).
+            max_workers: The maximum number of worker threads to use for parallel
+                         correction on each page. If None, defaults are used.
         Returns:
-            A dictionary containing aggregate statistics for the process across all pages:
-            {'elements_checked': total_checked, 'corrections_applied': total_applied}
+            Self for method chaining.
         Raises:
             RuntimeError: If the collection is empty, pages lack a parent PDF reference,
@@ -1346,17 +1566,28 @@ class PageCollection(Generic[P]):
         """
         if not self.pages:
             logger.warning("Cannot correct OCR for an empty PageCollection.")
+            # Return self even if empty to maintain chaining consistency
+            return self
         # Assume all pages share the same parent PDF object
         parent_pdf = self.pages[0]._parent
+        if not parent_pdf or not hasattr(parent_pdf, 'correct_ocr') or not callable(parent_pdf.correct_ocr):
+             raise RuntimeError(
+                 "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
+             )
         page_indices = [p.index for p in self.pages]
         logger.info(
-            f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}."
+            f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
         )
         # Delegate the call to the parent PDF object for the relevant pages
-        parent_pdf.correct_ocr(correction_callback=correction_callback, pages=page_indices)
+        # Pass the max_workers parameter down
+        parent_pdf.correct_ocr(
+            correction_callback=correction_callback,
+            pages=page_indices,
+            max_workers=max_workers # Pass it here
+        )
         return self

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl