PyPI - natural-pdf - Versions diffs - 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (134) hide show

natural_pdf/__init__.py +1 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +230 -151
natural_pdf/classification/mixin.py +49 -35
natural_pdf/classification/results.py +64 -46
natural_pdf/collections/mixins.py +68 -20
natural_pdf/collections/pdf_collection.py +177 -64
natural_pdf/core/element_manager.py +30 -14
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +423 -101
natural_pdf/core/pdf.py +633 -190
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +503 -131
natural_pdf/elements/region.py +659 -90
natural_pdf/elements/text.py +1 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +4 -3
natural_pdf/extraction/manager.py +50 -49
natural_pdf/extraction/mixin.py +90 -57
natural_pdf/extraction/result.py +9 -23
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +61 -25
natural_pdf/ocr/ocr_options.py +70 -10
natural_pdf/ocr/utils.py +6 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +219 -143
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +1 -1
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +24 -16
natural_pdf/utils/tqdm_utils.py +18 -10
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
natural_pdf-0.1.9.dist-info/RECORD +80 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/categorizing-documents/index.md +0 -168
docs/data-extraction/index.md +0 -87
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -969
docs/element-selection/index.md +0 -249
docs/finetuning/index.md +0 -176
docs/index.md +0 -189
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -256
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -417
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -152
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -119
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -275
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -337
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -293
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -414
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -513
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2439
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -517
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -3712
docs/tutorials/12-ocr-integration.md +0 -137
docs/tutorials/13-semantic-search.ipynb +0 -1718
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.8.dist-info/RECORD +0 -156
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0

natural_pdf/collections/pdf_collection.py CHANGED Viewed

@@ -1,13 +1,28 @@
+import concurrent.futures  # Import concurrent.futures
 import copy  # Added for copying options
 import glob as py_glob
 import logging
 import os
 import re  # Added for safe path generation
+import threading  # Import threading for logging thread information
+import time  # Import time for logging timestamps
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union, Callable
-import concurrent.futures # Import concurrent.futures
-import time # Import time for logging timestamps
-import threading # Import threading for logging thread information
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Set,
+    Type,
+    TypeVar,
+    Union,
+    overload,
+)
 from PIL import Image
 from tqdm import tqdm
@@ -26,6 +41,7 @@ logger = logging.getLogger(__name__)
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.region import Region
+from natural_pdf.export.mixin import ExportMixin
 # --- Search Imports ---
 try:
@@ -47,12 +63,12 @@ except ImportError as e:
     SearchServiceProtocol, SearchOptions, Indexable = object, object, object
-from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the new mixin
 # Import the ApplyMixin
 from natural_pdf.collections.mixins import ApplyMixin
+from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the new mixin
-class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
+class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixin
     def __init__(
         self,
         source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -252,54 +268,83 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
     def __repr__(self) -> str:
         # Removed search status
         return f"<PDFCollection(count={len(self._pdfs)})>"
+        return f"<PDFCollection(count={len(self._pdfs)})>"
     @property
     def pdfs(self) -> List["PDF"]:
         """Returns the list of PDF objects held by the collection."""
         return self._pdfs
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
     def find_all(
-        self,
-        selector: str,
-        apply_exclusions: bool = True,  # Added explicit parameter
-        regex: bool = False,            # Added explicit parameter
-        case: bool = True,             # Added explicit parameter
-        **kwargs
+        self,
+        selector: Optional[str] = None,  # Now optional
+        *,
+        text: Optional[str] = None,  # New text parameter
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
     ) -> "ElementCollection":
         """
-        Find all elements matching the selector across all PDFs in the collection.
+        Find all elements matching the selector OR text across all PDFs in the collection.
+        Provide EITHER `selector` OR `text`, but not both.
         This creates an ElementCollection that can span multiple PDFs. Note that
         some ElementCollection methods have limitations when spanning PDFs.
         Args:
-            selector: CSS-like selector string to query elements
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-            regex: Whether to use regex for text search in :contains (default: False)
-            case: Whether to do case-sensitive text search (default: True)
-            **kwargs: Additional keyword arguments passed to the find_all method of each PDF
+            selector: CSS-like selector string to query elements.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional keyword arguments passed to the find_all method of each PDF.
         Returns:
-            ElementCollection containing all matching elements across all PDFs
+            ElementCollection containing all matching elements across all PDFs.
         """
-        from natural_pdf.elements.collections import ElementCollection
+        # Validation happens within pdf.find_all
         # Collect elements from all PDFs
         all_elements = []
         for pdf in self._pdfs:
             try:
-                # Explicitly pass the relevant arguments down
+                # Pass the relevant arguments down to each PDF's find_all
                 elements = pdf.find_all(
-                    selector,
+                    selector=selector,
+                    text=text,
                     apply_exclusions=apply_exclusions,
                     regex=regex,
                     case=case,
-                    **kwargs
+                    **kwargs,
                 )
                 all_elements.extend(elements.elements)
             except Exception as e:
                 logger.error(f"Error finding elements in {pdf.path}: {e}", exc_info=True)
         return ElementCollection(all_elements)
     def apply_ocr(
@@ -330,24 +375,26 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
             replace: If True, replace existing OCR elements
             options: Engine-specific options
             pages: Specific pages to process (None for all pages)
-            max_workers: Maximum number of threads to process PDFs concurrently.
+            max_workers: Maximum number of threads to process PDFs concurrently.
                          If None or 1, processing is sequential. (default: None)
         Returns:
             Self for method chaining
         """
         PDF = self._get_pdf_class()
-        logger.info(f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})...")
+        logger.info(
+            f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})..."
+        )
         # Worker function takes PDF object again
         def _process_pdf(pdf: PDF):
             """Helper function to apply OCR to a single PDF, handling errors."""
-            thread_id = threading.current_thread().name # Get thread name for logging
-            pdf_path = pdf.path # Get path for logging
+            thread_id = threading.current_thread().name  # Get thread name for logging
+            pdf_path = pdf.path  # Get path for logging
             logger.debug(f"[{thread_id}] Starting OCR process for: {pdf_path}")
             start_time = time.monotonic()
             try:
-                pdf.apply_ocr( # Call apply_ocr on the original PDF object
+                pdf.apply_ocr(  # Call apply_ocr on the original PDF object
                     pages=pages,
                     engine=engine,
                     languages=languages,
@@ -362,17 +409,24 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
                     # For now, PDF.apply_ocr doesn't have it.
                 )
                 end_time = time.monotonic()
-                logger.debug(f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
+                logger.debug(
+                    f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
+                )
                 return pdf_path, None
             except Exception as e:
                 end_time = time.monotonic()
-                logger.error(f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
-                return pdf_path, e # Return path and error
+                logger.error(
+                    f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                    exc_info=False,
+                )
+                return pdf_path, e  # Return path and error
         # Use ThreadPoolExecutor for parallel processing if max_workers > 1
         if max_workers is not None and max_workers > 1:
             futures = []
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="OCRWorker") as executor:
+            with concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers, thread_name_prefix="OCRWorker"
+            ) as executor:
                 for pdf in self._pdfs:
                     # Submit the PDF object to the worker function
                     futures.append(executor.submit(_process_pdf, pdf))
@@ -382,22 +436,22 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
                 concurrent.futures.as_completed(futures),
                 total=len(self._pdfs),
                 desc="Applying OCR (Parallel)",
-                unit="pdf"
+                unit="pdf",
             )
             for future in progress_bar:
-                pdf_path, error = future.result() # Get result (or exception)
+                pdf_path, error = future.result()  # Get result (or exception)
                 if error:
                     progress_bar.set_postfix_str(f"Error: {pdf_path}", refresh=True)
                 # Progress is updated automatically by tqdm
-        else: # Sequential processing (max_workers is None or 1)
+        else:  # Sequential processing (max_workers is None or 1)
             logger.info("Applying OCR sequentially...")
             # Use the selected tqdm class for sequential too for consistency
             # Iterate over PDF objects directly for sequential
             for pdf in tqdm(self._pdfs, desc="Applying OCR (Sequential)", unit="pdf"):
-                _process_pdf(pdf) # Call helper directly with PDF object
+                _process_pdf(pdf)  # Call helper directly with PDF object
         logger.info("Finished applying OCR across the collection.")
         return self
@@ -421,7 +475,7 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
         Returns:
             Self for method chaining.
         """
-        PDF = self._get_pdf_class() # Ensure PDF class is available
+        PDF = self._get_pdf_class()  # Ensure PDF class is available
         if not callable(correction_callback):
             raise TypeError("`correction_callback` must be a callable function.")
@@ -436,7 +490,9 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
             return self
         total_elements = len(all_ocr_elements)
-        logger.info(f"Found {total_elements} OCR elements across the collection. Starting correction process...")
+        logger.info(
+            f"Found {total_elements} OCR elements across the collection. Starting correction process..."
+        )
         # 2. Initialize the progress bar
         progress_bar = tqdm(total=total_elements, desc="Correcting OCR Elements", unit="element")
@@ -450,11 +506,14 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
                 pdf.correct_ocr(
                     correction_callback=correction_callback,
                     max_workers=max_workers,
-                    progress_callback=progress_bar.update # Pass the bar's update method
+                    progress_callback=progress_bar.update,  # Pass the bar's update method
                 )
             except Exception as e:
-                 logger.error(f"Error occurred during correction process for PDF {pdf.path}: {e}", exc_info=True)
-                 # Decide if we should stop or continue? For now, continue.
+                logger.error(
+                    f"Error occurred during correction process for PDF {pdf.path}: {e}",
+                    exc_info=True,
+                )
+                # Decide if we should stop or continue? For now, continue.
         progress_bar.close()
@@ -544,7 +603,9 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
         if not categories:
             raise ValueError("Categories list cannot be empty.")
-        logger.info(f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')...")
+        logger.info(
+            f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')..."
+        )
         # Calculate total pages for the progress bar
         total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
@@ -553,9 +614,7 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
             return self
         progress_bar = tqdm(
-            total=total_pages,
-            desc=f"Classifying Pages (model: {model})",
-            unit="page"
+            total=total_pages, desc=f"Classifying Pages (model: {model})", unit="page"
         )
         # Worker function
@@ -570,15 +629,20 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
                     categories=categories,
                     model=model,
                     progress_callback=progress_bar.update,
-                    **kwargs
+                    **kwargs,
                 )
                 end_time = time.monotonic()
-                logger.debug(f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
-                return pdf_path, None # Return path and no error
+                logger.debug(
+                    f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
+                )
+                return pdf_path, None  # Return path and no error
             except Exception as e:
                 end_time = time.monotonic()
                 # Error is logged within classify_pages, but log summary here
-                logger.error(f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
+                logger.error(
+                    f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                    exc_info=False,
+                )
                 # Close progress bar immediately on error to avoid hanging
                 progress_bar.close()
                 # Re-raise the exception to stop the entire collection processing
@@ -589,16 +653,18 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
             if max_workers is not None and max_workers > 1:
                 logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
                 futures = []
-                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="ClassifyWorker") as executor:
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=max_workers, thread_name_prefix="ClassifyWorker"
+                ) as executor:
                     for pdf in self._pdfs:
                         futures.append(executor.submit(_process_pdf_classification, pdf))
                     # Wait for all futures to complete (progress updated by callback)
                     # Exceptions are raised by future.result() if worker failed
                     for future in concurrent.futures.as_completed(futures):
-                         future.result() # Raise exception if worker failed
+                        future.result()  # Raise exception if worker failed
-            else: # Sequential processing
+            else:  # Sequential processing
                 logger.info("Classifying PDFs sequentially.")
                 for pdf in self._pdfs:
                     _process_pdf_classification(pdf)
@@ -606,12 +672,59 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
             logger.info("Finished classification across the collection.")
         finally:
-             # Ensure progress bar is closed even if errors occurred elsewhere
-             if not progress_bar.disable and progress_bar.n < progress_bar.total:
-                 progress_bar.close()
-             elif progress_bar.disable is False:
-                  progress_bar.close()
+            # Ensure progress bar is closed even if errors occurred elsewhere
+            if not progress_bar.disable and progress_bar.n < progress_bar.total:
+                progress_bar.close()
+            elif progress_bar.disable is False:
+                progress_bar.close()
         return self
     # --- End Classification Method --- #
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all PDFs in the collection.
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not self._pdfs:
+            logger.warning("No PDFs found in collection")
+            return []
+        all_data = []
+        for pdf in tqdm(self._pdfs, desc="Gathering PDF data", leave=False):
+            # PDF level data
+            pdf_data = {
+                "pdf_path": pdf.path,
+                "pdf_filename": Path(pdf.path).name,
+                "total_pages": len(pdf.pages) if hasattr(pdf, "pages") else 0,
+            }
+            # Add metadata if available
+            if hasattr(pdf, "metadata") and pdf.metadata:
+                for k, v in pdf.metadata.items():
+                    if v:  # Only add non-empty metadata
+                        pdf_data[f"metadata.{k}"] = str(v)
+            all_data.append(pdf_data)
+        return all_data

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -544,56 +544,56 @@ class ElementManager:
         """
         Remove all elements with source="ocr" from the elements dictionary.
         This should be called before adding new OCR elements if replacement is desired.
         Returns:
             int: Number of OCR elements removed
         """
         # Load elements if not already loaded
         self.load_elements()
         removed_count = 0
         # Filter out OCR elements from words
         if "words" in self._elements:
             original_len = len(self._elements["words"])
             self._elements["words"] = [
-                word for word in self._elements["words"]
-                if getattr(word, "source", None) != "ocr"
+                word for word in self._elements["words"] if getattr(word, "source", None) != "ocr"
             ]
             removed_count += original_len - len(self._elements["words"])
         # Filter out OCR elements from chars
         if "chars" in self._elements:
             original_len = len(self._elements["chars"])
             self._elements["chars"] = [
-                char for char in self._elements["chars"]
-                if (isinstance(char, dict) and char.get("source") != "ocr") or
-                   (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
+                char
+                for char in self._elements["chars"]
+                if (isinstance(char, dict) and char.get("source") != "ocr")
+                or (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
             ]
             removed_count += original_len - len(self._elements["chars"])
         logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
         return removed_count
     def remove_element(self, element, element_type="words"):
         """
         Remove a specific element from the managed elements.
         Args:
             element: The element to remove
             element_type: The type of element ('words', 'chars', etc.)
         Returns:
             bool: True if removed successfully, False otherwise
         """
         # Load elements if not already loaded
         self.load_elements()
         # Check if the collection exists
         if element_type not in self._elements:
             logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
             return False
         # Try to remove the element
         try:
             if element in self._elements[element_type]:
@@ -606,3 +606,19 @@ class ElementManager:
         except Exception as e:
             logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
             return False
+    def has_elements(self) -> bool:
+        """
+        Check if any significant elements (words, rects, lines, regions)
+        have been loaded or added.
+        Returns:
+            True if any elements exist, False otherwise.
+        """
+        self.load_elements()
+        for key in ["words", "rects", "lines", "regions"]:
+            if self._elements.get(key):
+                return True
+        return False

natural_pdf/core/highlighting_service.py CHANGED Viewed

@@ -18,7 +18,12 @@ except ImportError:
     Page = Any  # Fallback if circular import issue arises during type checking
 # Import ColorManager and related utils
-from natural_pdf.utils.visualization import ColorManager, create_legend, merge_images_with_legend
+from natural_pdf.utils.visualization import (
+    ColorManager,
+    create_legend,
+    merge_images_with_legend,
+    render_plain_page,
+)
 # Constants for drawing (Can be potentially moved to ColorManager/Renderer if desired)
 BORDER_ALPHA = 180  # Default alpha for highlight border
@@ -622,28 +627,14 @@ class HighlightingService:
             return None
         page = self._pdf[page_index]
-        highlights_on_page = self.get_highlights_for_page(
-            page_index
-        )  # This list will be empty if clear_page was called
+        highlights_on_page = self.get_highlights_for_page(page_index)
-        # --- Get Base Image ---
-        try:
-            render_resolution = resolution if resolution is not None else scale * 72
-            img_object = page._page.to_image(resolution=render_resolution, **kwargs)
-            base_image = img_object.annotated
-            if not isinstance(base_image, Image.Image):
-                png_data = img_object._repr_png_()
-                if png_data:
-                    base_image = Image.open(io.BytesIO(png_data)).convert("RGB")
-                else:
-                    raise ValueError("Could not extract base PIL image from pdfplumber.")
-            base_image = base_image.convert("RGBA")
-            logger.debug(
-                f"Base image for page {page_index} rendered with resolution {render_resolution}."
-            )
-        except Exception as e:
-            logger.error(f"Failed to render base image for page {page_index}: {e}", exc_info=True)
-            return None
+        render_resolution = resolution if resolution is not None else scale * 72
+        base_image = render_plain_page(page, render_resolution)
+        base_image = base_image.convert("RGBA")
+        logger.debug(
+            f"Base image for page {page_index} rendered with resolution {render_resolution}."
+        )
         # --- Render Highlights ---
         rendered_image: Image.Image

natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl