PyPI - natural-pdf - Versions diffs - 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl - Mend

natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

natural_pdf/__init__.py +1 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +241 -158
natural_pdf/classification/mixin.py +52 -38
natural_pdf/classification/results.py +71 -45
natural_pdf/collections/mixins.py +85 -20
natural_pdf/collections/pdf_collection.py +245 -100
natural_pdf/core/element_manager.py +30 -14
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +423 -101
natural_pdf/core/pdf.py +694 -195
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +610 -134
natural_pdf/elements/region.py +659 -90
natural_pdf/elements/text.py +1 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +4 -3
natural_pdf/extraction/manager.py +50 -49
natural_pdf/extraction/mixin.py +90 -57
natural_pdf/extraction/result.py +9 -23
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +61 -25
natural_pdf/ocr/ocr_options.py +70 -10
natural_pdf/ocr/utils.py +6 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +219 -143
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +1 -1
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +24 -16
natural_pdf/utils/tqdm_utils.py +18 -10
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
natural_pdf-0.1.10.dist-info/RECORD +80 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/categorizing-documents/index.md +0 -168
docs/data-extraction/index.md +0 -87
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -969
docs/element-selection/index.md +0 -249
docs/finetuning/index.md +0 -176
docs/index.md +0 -189
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -256
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -417
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -152
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -119
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -275
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -337
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -293
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -414
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -513
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2439
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -517
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -3712
docs/tutorials/12-ocr-integration.md +0 -137
docs/tutorials/13-semantic-search.ipynb +0 -1718
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.8.dist-info/RECORD +0 -156
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0

natural_pdf/collections/pdf_collection.py CHANGED Viewed

@@ -1,13 +1,28 @@
+import concurrent.futures  # Import concurrent.futures
 import copy  # Added for copying options
 import glob as py_glob
 import logging
 import os
 import re  # Added for safe path generation
+import threading  # Import threading for logging thread information
+import time  # Import time for logging timestamps
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union, Callable
-import concurrent.futures # Import concurrent.futures
-import time # Import time for logging timestamps
-import threading # Import threading for logging thread information
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Set,
+    Type,
+    TypeVar,
+    Union,
+    overload,
+)
 from PIL import Image
 from tqdm import tqdm
@@ -26,6 +41,7 @@ logger = logging.getLogger(__name__)
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.region import Region
+from natural_pdf.export.mixin import ExportMixin
 # --- Search Imports ---
 try:
@@ -47,12 +63,12 @@ except ImportError as e:
     SearchServiceProtocol, SearchOptions, Indexable = object, object, object
-from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the new mixin
 # Import the ApplyMixin
 from natural_pdf.collections.mixins import ApplyMixin
+from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the new mixin
-class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
+class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixin
     def __init__(
         self,
         source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -252,54 +268,83 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
     def __repr__(self) -> str:
         # Removed search status
         return f"<PDFCollection(count={len(self._pdfs)})>"
+        return f"<PDFCollection(count={len(self._pdfs)})>"
     @property
     def pdfs(self) -> List["PDF"]:
         """Returns the list of PDF objects held by the collection."""
         return self._pdfs
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+    @overload
     def find_all(
-        self,
-        selector: str,
-        apply_exclusions: bool = True,  # Added explicit parameter
-        regex: bool = False,            # Added explicit parameter
-        case: bool = True,             # Added explicit parameter
-        **kwargs
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+    def find_all(
+        self,
+        selector: Optional[str] = None,  # Now optional
+        *,
+        text: Optional[str] = None,  # New text parameter
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
     ) -> "ElementCollection":
         """
-        Find all elements matching the selector across all PDFs in the collection.
+        Find all elements matching the selector OR text across all PDFs in the collection.
+        Provide EITHER `selector` OR `text`, but not both.
         This creates an ElementCollection that can span multiple PDFs. Note that
         some ElementCollection methods have limitations when spanning PDFs.
         Args:
-            selector: CSS-like selector string to query elements
-            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-            regex: Whether to use regex for text search in :contains (default: False)
-            case: Whether to do case-sensitive text search (default: True)
-            **kwargs: Additional keyword arguments passed to the find_all method of each PDF
+            selector: CSS-like selector string to query elements.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional keyword arguments passed to the find_all method of each PDF.
         Returns:
-            ElementCollection containing all matching elements across all PDFs
+            ElementCollection containing all matching elements across all PDFs.
         """
-        from natural_pdf.elements.collections import ElementCollection
+        # Validation happens within pdf.find_all
         # Collect elements from all PDFs
         all_elements = []
         for pdf in self._pdfs:
             try:
-                # Explicitly pass the relevant arguments down
+                # Pass the relevant arguments down to each PDF's find_all
                 elements = pdf.find_all(
-                    selector,
+                    selector=selector,
+                    text=text,
                     apply_exclusions=apply_exclusions,
                     regex=regex,
                     case=case,
-                    **kwargs
+                    **kwargs,
                 )
                 all_elements.extend(elements.elements)
             except Exception as e:
                 logger.error(f"Error finding elements in {pdf.path}: {e}", exc_info=True)
         return ElementCollection(all_elements)
     def apply_ocr(
@@ -330,24 +375,26 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
             replace: If True, replace existing OCR elements
             options: Engine-specific options
             pages: Specific pages to process (None for all pages)
-            max_workers: Maximum number of threads to process PDFs concurrently.
+            max_workers: Maximum number of threads to process PDFs concurrently.
                          If None or 1, processing is sequential. (default: None)
         Returns:
             Self for method chaining
         """
         PDF = self._get_pdf_class()
-        logger.info(f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})...")
+        logger.info(
+            f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})..."
+        )
         # Worker function takes PDF object again
         def _process_pdf(pdf: PDF):
             """Helper function to apply OCR to a single PDF, handling errors."""
-            thread_id = threading.current_thread().name # Get thread name for logging
-            pdf_path = pdf.path # Get path for logging
+            thread_id = threading.current_thread().name  # Get thread name for logging
+            pdf_path = pdf.path  # Get path for logging
             logger.debug(f"[{thread_id}] Starting OCR process for: {pdf_path}")
             start_time = time.monotonic()
             try:
-                pdf.apply_ocr( # Call apply_ocr on the original PDF object
+                pdf.apply_ocr(  # Call apply_ocr on the original PDF object
                     pages=pages,
                     engine=engine,
                     languages=languages,
@@ -362,17 +409,24 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
                     # For now, PDF.apply_ocr doesn't have it.
                 )
                 end_time = time.monotonic()
-                logger.debug(f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
+                logger.debug(
+                    f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
+                )
                 return pdf_path, None
             except Exception as e:
                 end_time = time.monotonic()
-                logger.error(f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
-                return pdf_path, e # Return path and error
+                logger.error(
+                    f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                    exc_info=False,
+                )
+                return pdf_path, e  # Return path and error
         # Use ThreadPoolExecutor for parallel processing if max_workers > 1
         if max_workers is not None and max_workers > 1:
             futures = []
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="OCRWorker") as executor:
+            with concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers, thread_name_prefix="OCRWorker"
+            ) as executor:
                 for pdf in self._pdfs:
                     # Submit the PDF object to the worker function
                     futures.append(executor.submit(_process_pdf, pdf))
@@ -382,22 +436,22 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
                 concurrent.futures.as_completed(futures),
                 total=len(self._pdfs),
                 desc="Applying OCR (Parallel)",
-                unit="pdf"
+                unit="pdf",
             )
             for future in progress_bar:
-                pdf_path, error = future.result() # Get result (or exception)
+                pdf_path, error = future.result()  # Get result (or exception)
                 if error:
                     progress_bar.set_postfix_str(f"Error: {pdf_path}", refresh=True)
                 # Progress is updated automatically by tqdm
-        else: # Sequential processing (max_workers is None or 1)
+        else:  # Sequential processing (max_workers is None or 1)
             logger.info("Applying OCR sequentially...")
             # Use the selected tqdm class for sequential too for consistency
             # Iterate over PDF objects directly for sequential
             for pdf in tqdm(self._pdfs, desc="Applying OCR (Sequential)", unit="pdf"):
-                _process_pdf(pdf) # Call helper directly with PDF object
+                _process_pdf(pdf)  # Call helper directly with PDF object
         logger.info("Finished applying OCR across the collection.")
         return self
@@ -421,7 +475,7 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
         Returns:
             Self for method chaining.
         """
-        PDF = self._get_pdf_class() # Ensure PDF class is available
+        PDF = self._get_pdf_class()  # Ensure PDF class is available
         if not callable(correction_callback):
             raise TypeError("`correction_callback` must be a callable function.")
@@ -436,7 +490,9 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
             return self
         total_elements = len(all_ocr_elements)
-        logger.info(f"Found {total_elements} OCR elements across the collection. Starting correction process...")
+        logger.info(
+            f"Found {total_elements} OCR elements across the collection. Starting correction process..."
+        )
         # 2. Initialize the progress bar
         progress_bar = tqdm(total=total_elements, desc="Correcting OCR Elements", unit="element")
@@ -450,17 +506,20 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
                 pdf.correct_ocr(
                     correction_callback=correction_callback,
                     max_workers=max_workers,
-                    progress_callback=progress_bar.update # Pass the bar's update method
+                    progress_callback=progress_bar.update,  # Pass the bar's update method
                 )
             except Exception as e:
-                 logger.error(f"Error occurred during correction process for PDF {pdf.path}: {e}", exc_info=True)
-                 # Decide if we should stop or continue? For now, continue.
+                logger.error(
+                    f"Error occurred during correction process for PDF {pdf.path}: {e}",
+                    exc_info=True,
+                )
+                # Decide if we should stop or continue? For now, continue.
         progress_bar.close()
         return self
-    def categorize(self, categories: List[str], **kwargs):
+    def categorize(self, labels: List[str], **kwargs):
         """Categorizes PDFs in the collection based on content or features."""
         # Implementation requires integrating with classification models or logic
         raise NotImplementedError("categorize requires classification implementation.")
@@ -511,107 +570,193 @@ class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
     # --- Classification Method --- #
     def classify_all(
         self,
-        categories: List[str],
-        model: str = "text",
+        labels: List[str],
+        using: Optional[str] = None,  # Default handled by PDF.classify -> manager
+        model: Optional[str] = None,  # Optional model ID
         max_workers: Optional[int] = None,
+        analysis_key: str = "classification",  # Key for storing result in PDF.analyses
         **kwargs,
     ) -> "PDFCollection":
         """
-        Classify all pages across all PDFs in the collection, potentially in parallel.
+        Classify each PDF document in the collection, potentially in parallel.
-        This method uses the unified `classify_all` approach, delegating page
-        classification to each PDF's `classify_pages` method.
-        It displays a progress bar tracking individual pages.
+        This method delegates classification to each PDF object's `classify` method.
+        By default, uses the full extracted text of the PDF.
+        If `using='vision'`, it classifies the first page's image, but ONLY if
+        the PDF has a single page (raises ValueError otherwise).
         Args:
-            categories: A list of string category names.
-            model: Model identifier ('text', 'vision', or specific HF ID).
+            labels: A list of string category names.
+            using: Processing mode ('text', 'vision'). If None, manager infers (defaulting to text).
+            model: Optional specific model identifier (e.g., HF ID). If None, manager uses default for 'using' mode.
             max_workers: Maximum number of threads to process PDFs concurrently.
                          If None or 1, processing is sequential.
-            **kwargs: Additional arguments passed down to `pdf.classify_pages` and
-                      subsequently to `page.classify` (e.g., device,
-                      confidence_threshold, resolution).
+            analysis_key: Key under which to store the ClassificationResult in each PDF's `analyses` dict.
+            **kwargs: Additional arguments passed down to `pdf.classify` (e.g., device,
+                      min_confidence, multi_label, text extraction options).
         Returns:
             Self for method chaining.
         Raises:
-            ValueError: If categories list is empty.
-            ClassificationError: If classification fails for any page (will stop processing).
+            ValueError: If labels list is empty, or if using='vision' on a multi-page PDF.
+            ClassificationError: If classification fails for any PDF (will stop processing).
             ImportError: If classification dependencies are missing.
         """
         PDF = self._get_pdf_class()
-        if not categories:
-            raise ValueError("Categories list cannot be empty.")
-        logger.info(f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')...")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
-        # Calculate total pages for the progress bar
-        total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
-        if total_pages == 0:
-            logger.warning("No pages found in the PDF collection to classify.")
+        if not self._pdfs:
+            logger.warning("PDFCollection is empty, skipping classification.")
             return self
+        mode_desc = f"using='{using}'" if using else f"model='{model}'" if model else "default text"
+        logger.info(
+            f"Starting classification for {len(self._pdfs)} PDFs in collection ({mode_desc})..."
+        )
         progress_bar = tqdm(
-            total=total_pages,
-            desc=f"Classifying Pages (model: {model})",
-            unit="page"
+            total=len(self._pdfs), desc=f"Classifying PDFs ({mode_desc})", unit="pdf"
         )
         # Worker function
         def _process_pdf_classification(pdf: PDF):
             thread_id = threading.current_thread().name
             pdf_path = pdf.path
-            logger.debug(f"[{thread_id}] Starting classification process for: {pdf_path}")
+            logger.debug(f"[{thread_id}] Starting classification process for PDF: {pdf_path}")
             start_time = time.monotonic()
             try:
-                # Call classify_pages on the PDF, passing the progress callback
-                pdf.classify_pages(
-                    categories=categories,
+                # Call classify directly on the PDF object
+                pdf.classify(
+                    labels=labels,
+                    using=using,
                     model=model,
-                    progress_callback=progress_bar.update,
-                    **kwargs
+                    analysis_key=analysis_key,
+                    **kwargs,  # Pass other relevant args like min_confidence, multi_label
+                )
+                end_time = time.monotonic()
+                logger.debug(
+                    f"[{thread_id}] Finished classification for PDF: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
                 )
+                progress_bar.update(1)  # Update progress bar upon success
+                return pdf_path, None  # Return path and no error
+            except ValueError as ve:
+                # Catch specific error for vision on multi-page PDF
                 end_time = time.monotonic()
-                logger.debug(f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
-                return pdf_path, None # Return path and no error
+                logger.error(
+                    f"[{thread_id}] Skipped classification for {pdf_path} after {end_time - start_time:.2f}s: {ve}",
+                    exc_info=False,
+                )
+                progress_bar.update(1)  # Still update progress bar
+                return pdf_path, ve  # Return the specific ValueError
             except Exception as e:
                 end_time = time.monotonic()
-                # Error is logged within classify_pages, but log summary here
-                logger.error(f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
-                # Close progress bar immediately on error to avoid hanging
-                progress_bar.close()
+                logger.error(
+                    f"[{thread_id}] Failed classification process for PDF {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                    exc_info=True,  # Log full traceback for unexpected errors
+                )
+                # Close progress bar immediately on critical error to avoid hanging
+                if not progress_bar.disable:
+                    progress_bar.close()
                 # Re-raise the exception to stop the entire collection processing
-                raise
+                raise ClassificationError(f"Classification failed for {pdf_path}: {e}") from e
         # Use ThreadPoolExecutor for parallel processing if max_workers > 1
+        processed_count = 0
+        skipped_count = 0
         try:
             if max_workers is not None and max_workers > 1:
                 logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
                 futures = []
-                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="ClassifyWorker") as executor:
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=max_workers, thread_name_prefix="ClassifyWorker"
+                ) as executor:
                     for pdf in self._pdfs:
                         futures.append(executor.submit(_process_pdf_classification, pdf))
-                    # Wait for all futures to complete (progress updated by callback)
-                    # Exceptions are raised by future.result() if worker failed
+                    # Wait for all futures to complete
+                    # Progress updated within worker
                     for future in concurrent.futures.as_completed(futures):
-                         future.result() # Raise exception if worker failed
-            else: # Sequential processing
+                        processed_count += 1
+                        pdf_path, error = (
+                            future.result()
+                        )  # Raise ClassificationError if worker failed critically
+                        if isinstance(error, ValueError):
+                            # Logged in worker, just count as skipped
+                            skipped_count += 1
+            else:  # Sequential processing
                 logger.info("Classifying PDFs sequentially.")
                 for pdf in self._pdfs:
-                    _process_pdf_classification(pdf)
-            logger.info("Finished classification across the collection.")
+                    processed_count += 1
+                    pdf_path, error = _process_pdf_classification(
+                        pdf
+                    )  # Raise ClassificationError if worker failed critically
+                    if isinstance(error, ValueError):
+                        skipped_count += 1
+            final_message = (
+                f"Finished classification across the collection. Processed: {processed_count}"
+            )
+            if skipped_count > 0:
+                final_message += f", Skipped (e.g., vision on multi-page): {skipped_count}"
+            logger.info(final_message + ".")
         finally:
-             # Ensure progress bar is closed even if errors occurred elsewhere
-             if not progress_bar.disable and progress_bar.n < progress_bar.total:
-                 progress_bar.close()
-             elif progress_bar.disable is False:
-                  progress_bar.close()
+            # Ensure progress bar is closed properly
+            if not progress_bar.disable and progress_bar.n < progress_bar.total:
+                progress_bar.n = progress_bar.total  # Ensure it reaches 100%
+            if not progress_bar.disable:
+                progress_bar.close()
         return self
     # --- End Classification Method --- #
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all PDFs in the collection.
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not self._pdfs:
+            logger.warning("No PDFs found in collection")
+            return []
+        all_data = []
+        for pdf in tqdm(self._pdfs, desc="Gathering PDF data", leave=False):
+            # PDF level data
+            pdf_data = {
+                "pdf_path": pdf.path,
+                "pdf_filename": Path(pdf.path).name,
+                "total_pages": len(pdf.pages) if hasattr(pdf, "pages") else 0,
+            }
+            # Add metadata if available
+            if hasattr(pdf, "metadata") and pdf.metadata:
+                for k, v in pdf.metadata.items():
+                    if v:  # Only add non-empty metadata
+                        pdf_data[f"metadata.{k}"] = str(v)
+            all_data.append(pdf_data)
+        return all_data

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -544,56 +544,56 @@ class ElementManager:
         """
         Remove all elements with source="ocr" from the elements dictionary.
         This should be called before adding new OCR elements if replacement is desired.
         Returns:
             int: Number of OCR elements removed
         """
         # Load elements if not already loaded
         self.load_elements()
         removed_count = 0
         # Filter out OCR elements from words
         if "words" in self._elements:
             original_len = len(self._elements["words"])
             self._elements["words"] = [
-                word for word in self._elements["words"]
-                if getattr(word, "source", None) != "ocr"
+                word for word in self._elements["words"] if getattr(word, "source", None) != "ocr"
             ]
             removed_count += original_len - len(self._elements["words"])
         # Filter out OCR elements from chars
         if "chars" in self._elements:
             original_len = len(self._elements["chars"])
             self._elements["chars"] = [
-                char for char in self._elements["chars"]
-                if (isinstance(char, dict) and char.get("source") != "ocr") or
-                   (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
+                char
+                for char in self._elements["chars"]
+                if (isinstance(char, dict) and char.get("source") != "ocr")
+                or (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
             ]
             removed_count += original_len - len(self._elements["chars"])
         logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
         return removed_count
     def remove_element(self, element, element_type="words"):
         """
         Remove a specific element from the managed elements.
         Args:
             element: The element to remove
             element_type: The type of element ('words', 'chars', etc.)
         Returns:
             bool: True if removed successfully, False otherwise
         """
         # Load elements if not already loaded
         self.load_elements()
         # Check if the collection exists
         if element_type not in self._elements:
             logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
             return False
         # Try to remove the element
         try:
             if element in self._elements[element_type]:
@@ -606,3 +606,19 @@ class ElementManager:
         except Exception as e:
             logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
             return False
+    def has_elements(self) -> bool:
+        """
+        Check if any significant elements (words, rects, lines, regions)
+        have been loaded or added.
+        Returns:
+            True if any elements exist, False otherwise.
+        """
+        self.load_elements()
+        for key in ["words", "rects", "lines", "regions"]:
+            if self._elements.get(key):
+                return True
+        return False

natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl