PyPI - natural-pdf - Versions diffs - 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl - Mend

natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

natural_pdf/analyzers/__init__.py +2 -2
natural_pdf/analyzers/guides.py +670 -595
natural_pdf/analyzers/layout/base.py +53 -6
natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
natural_pdf/analyzers/layout/layout_manager.py +18 -14
natural_pdf/analyzers/layout/layout_options.py +1 -0
natural_pdf/analyzers/layout/paddle.py +102 -64
natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
natural_pdf/analyzers/layout/yolo.py +2 -6
natural_pdf/analyzers/shape_detection_mixin.py +15 -6
natural_pdf/classification/manager.py +92 -77
natural_pdf/classification/mixin.py +49 -5
natural_pdf/classification/results.py +1 -1
natural_pdf/cli.py +7 -3
natural_pdf/collections/pdf_collection.py +96 -101
natural_pdf/core/element_manager.py +131 -45
natural_pdf/core/highlighting_service.py +5 -6
natural_pdf/core/page.py +113 -22
natural_pdf/core/pdf.py +477 -75
natural_pdf/describe/__init__.py +18 -12
natural_pdf/describe/base.py +179 -172
natural_pdf/describe/elements.py +155 -155
natural_pdf/describe/mixin.py +27 -19
natural_pdf/describe/summary.py +44 -55
natural_pdf/elements/base.py +134 -18
natural_pdf/elements/collections.py +90 -18
natural_pdf/elements/image.py +2 -1
natural_pdf/elements/line.py +0 -31
natural_pdf/elements/rect.py +0 -14
natural_pdf/elements/region.py +222 -108
natural_pdf/elements/text.py +18 -12
natural_pdf/exporters/__init__.py +4 -1
natural_pdf/exporters/original_pdf.py +12 -4
natural_pdf/extraction/mixin.py +66 -10
natural_pdf/extraction/result.py +1 -1
natural_pdf/flows/flow.py +63 -4
natural_pdf/flows/region.py +4 -4
natural_pdf/ocr/engine.py +83 -2
natural_pdf/ocr/engine_paddle.py +5 -5
natural_pdf/ocr/ocr_factory.py +2 -1
natural_pdf/ocr/ocr_manager.py +24 -13
natural_pdf/ocr/ocr_options.py +3 -10
natural_pdf/qa/document_qa.py +21 -8
natural_pdf/qa/qa_result.py +3 -7
natural_pdf/search/__init__.py +3 -2
natural_pdf/search/lancedb_search_service.py +5 -6
natural_pdf/search/numpy_search_service.py +5 -2
natural_pdf/selectors/parser.py +51 -6
natural_pdf/tables/__init__.py +2 -2
natural_pdf/tables/result.py +7 -6
natural_pdf/utils/bidi_mirror.py +2 -1
natural_pdf/utils/reading_order.py +3 -2
natural_pdf/utils/visualization.py +3 -3
natural_pdf/widgets/viewer.py +0 -1
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
natural_pdf-0.1.34.dist-info/RECORD +121 -0
optimization/memory_comparison.py +73 -58
optimization/pdf_analyzer.py +141 -96
optimization/performance_analysis.py +111 -110
optimization/test_cleanup_methods.py +47 -36
optimization/test_memory_fix.py +40 -39
tools/bad_pdf_eval/__init__.py +0 -1
tools/bad_pdf_eval/analyser.py +35 -18
tools/bad_pdf_eval/collate_summaries.py +22 -18
tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
tools/bad_pdf_eval/eval_suite.py +21 -9
tools/bad_pdf_eval/evaluate_quality.py +198 -0
tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
tools/bad_pdf_eval/llm_enrich.py +71 -39
tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
tools/bad_pdf_eval/reporter.py +1 -1
tools/bad_pdf_eval/utils.py +7 -4
natural_pdf-0.1.33.dist-info/RECORD +0 -118
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0

natural_pdf/classification/manager.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import logging
 import time
+import threading  # Add threading for locks
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
@@ -9,6 +10,7 @@ from PIL import Image
 # Use try-except for robustness if dependencies are missing
 _CLASSIFICATION_AVAILABLE = None
 def _check_classification_dependencies():
     """Lazy check for classification dependencies."""
     global _CLASSIFICATION_AVAILABLE
@@ -16,16 +18,20 @@ def _check_classification_dependencies():
         try:
             import torch
             import transformers
             _CLASSIFICATION_AVAILABLE = True
         except ImportError:
             _CLASSIFICATION_AVAILABLE = False
     return _CLASSIFICATION_AVAILABLE
 def _get_torch():
     """Lazy import for torch."""
     import torch
     return torch
 def _get_transformers_components():
     """Lazy import for transformers components."""
     from transformers import (
@@ -34,13 +40,15 @@ def _get_transformers_components():
         AutoTokenizer,
         pipeline,
     )
     return {
-        'AutoModelForSequenceClassification': AutoModelForSequenceClassification,
-        'AutoModelForZeroShotImageClassification': AutoModelForZeroShotImageClassification,
-        'AutoTokenizer': AutoTokenizer,
-        'pipeline': pipeline,
+        "AutoModelForSequenceClassification": AutoModelForSequenceClassification,
+        "AutoModelForZeroShotImageClassification": AutoModelForZeroShotImageClassification,
+        "AutoTokenizer": AutoTokenizer,
+        "pipeline": pipeline,
     }
 from tqdm.auto import tqdm
 # Import result classes
@@ -52,10 +60,12 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
-# Global cache for models/pipelines
+# Global cache for models/pipelines with thread safety
 _PIPELINE_CACHE: Dict[str, "Pipeline"] = {}
 _TOKENIZER_CACHE: Dict[str, Any] = {}
 _MODEL_CACHE: Dict[str, Any] = {}
+_CACHE_LOCK = threading.RLock()  # Reentrant lock for thread safety
 # Export the availability check function for external use
 def is_classification_available() -> bool:
@@ -107,34 +117,35 @@ class ClassificationManager:
     def _get_pipeline(self, model_id: str, using: str) -> "Pipeline":
         """Get or create a classification pipeline."""
         cache_key = f"{model_id}_{using}_{self.device}"
-        if cache_key not in _PIPELINE_CACHE:
-            logger.info(
-                f"Loading {using} classification pipeline for model '{model_id}' on device '{self.device}'..."
-            )
-            start_time = time.time()
-            try:
-                # Lazy import transformers components
-                transformers_components = _get_transformers_components()
-                pipeline = transformers_components['pipeline']
-                task = (
-                    "zero-shot-classification"
-                    if using == "text"
-                    else "zero-shot-image-classification"
-                )
-                _PIPELINE_CACHE[cache_key] = pipeline(task, model=model_id, device=self.device)
-                end_time = time.time()
+        with _CACHE_LOCK:
+            if cache_key not in _PIPELINE_CACHE:
                 logger.info(
-                    f"Pipeline for '{model_id}' loaded in {end_time - start_time:.2f} seconds."
-                )
-            except Exception as e:
-                logger.error(
-                    f"Failed to load pipeline for model '{model_id}' (using: {using}): {e}",
-                    exc_info=True,
+                    f"Loading {using} classification pipeline for model '{model_id}' on device '{self.device}'..."
                 )
-                raise ClassificationError(
-                    f"Failed to load pipeline for model '{model_id}'. Ensure the model ID is correct and supports the {task} task."
-                ) from e
+                start_time = time.time()
+                try:
+                    # Lazy import transformers components
+                    transformers_components = _get_transformers_components()
+                    pipeline = transformers_components["pipeline"]
+                    task = (
+                        "zero-shot-classification"
+                        if using == "text"
+                        else "zero-shot-image-classification"
+                    )
+                    _PIPELINE_CACHE[cache_key] = pipeline(task, model=model_id, device=self.device)
+                    end_time = time.time()
+                    logger.info(
+                        f"Pipeline for '{model_id}' loaded in {end_time - start_time:.2f} seconds."
+                    )
+                except Exception as e:
+                    logger.error(
+                        f"Failed to load pipeline for model '{model_id}' (using: {using}): {e}",
+                        exc_info=True,
+                    )
+                    raise ClassificationError(
+                        f"Failed to load pipeline for model '{model_id}'. Ensure the model ID is correct and supports the {task} task."
+                    ) from e
         return _PIPELINE_CACHE[cache_key]
     def infer_using(self, model_id: str, using: Optional[str] = None) -> str:
@@ -452,66 +463,70 @@ class ClassificationManager:
     def cleanup_models(self, model_id: Optional[str] = None) -> int:
         """
         Cleanup classification models to free memory.
         Args:
             model_id: Specific model to cleanup, or None to cleanup all models
         Returns:
             Number of models cleaned up
         """
         global _PIPELINE_CACHE, _TOKENIZER_CACHE, _MODEL_CACHE
         cleaned_count = 0
         if model_id:
             # Cleanup specific model - search cache keys that contain the model_id
-            keys_to_remove = [key for key in _PIPELINE_CACHE.keys() if model_id in key]
-            for key in keys_to_remove:
-                pipeline = _PIPELINE_CACHE.pop(key, None)
-                if pipeline and hasattr(pipeline, 'model'):
-                    # Try to cleanup GPU memory if using torch
-                    try:
-                        torch = _get_torch()
-                        if hasattr(pipeline.model, 'to'):
-                            pipeline.model.to('cpu')  # Move to CPU
-                        if torch.cuda.is_available():
-                            torch.cuda.empty_cache()  # Clear GPU cache
-                    except Exception as e:
-                        logger.debug(f"GPU cleanup failed for model {model_id}: {e}")
-                cleaned_count += 1
-                logger.info(f"Cleaned up classification pipeline: {key}")
+            with _CACHE_LOCK:
+                keys_to_remove = [key for key in _PIPELINE_CACHE.keys() if model_id in key]
+                for key in keys_to_remove:
+                    pipeline = _PIPELINE_CACHE.pop(key, None)
+                    if pipeline and hasattr(pipeline, "model"):
+                        # Try to cleanup GPU memory if using torch
+                        try:
+                            torch = _get_torch()
+                            if hasattr(pipeline.model, "to"):
+                                pipeline.model.to("cpu")  # Move to CPU
+                            if torch.cuda.is_available():
+                                torch.cuda.empty_cache()  # Clear GPU cache
+                        except Exception as e:
+                            logger.debug(f"GPU cleanup failed for model {model_id}: {e}")
+                        cleaned_count += 1
+                        logger.info(f"Cleaned up classification pipeline: {key}")
             # Also cleanup tokenizer and model caches for this model
-            tokenizer_keys = [key for key in _TOKENIZER_CACHE.keys() if model_id in key]
-            for key in tokenizer_keys:
-                _TOKENIZER_CACHE.pop(key, None)
-            model_keys = [key for key in _MODEL_CACHE.keys() if model_id in key]
-            for key in model_keys:
-                _MODEL_CACHE.pop(key, None)
+            with _CACHE_LOCK:
+                tokenizer_keys = [key for key in _TOKENIZER_CACHE.keys() if model_id in key]
+                for key in tokenizer_keys:
+                    _TOKENIZER_CACHE.pop(key, None)
+                model_keys = [key for key in _MODEL_CACHE.keys() if model_id in key]
+                for key in model_keys:
+                    _MODEL_CACHE.pop(key, None)
         else:
             # Cleanup all models
-            for key, pipeline in list(_PIPELINE_CACHE.items()):
-                if hasattr(pipeline, 'model'):
-                    try:
-                        torch = _get_torch()
-                        if hasattr(pipeline.model, 'to'):
-                            pipeline.model.to('cpu')  # Move to CPU
-                        if torch.cuda.is_available():
-                            torch.cuda.empty_cache()  # Clear GPU cache
-                    except Exception as e:
-                        logger.debug(f"GPU cleanup failed for pipeline {key}: {e}")
+            with _CACHE_LOCK:
+                for key, pipeline in list(_PIPELINE_CACHE.items()):
+                    if hasattr(pipeline, "model"):
+                        try:
+                            torch = _get_torch()
+                            if hasattr(pipeline.model, "to"):
+                                pipeline.model.to("cpu")  # Move to CPU
+                            if torch.cuda.is_available():
+                                torch.cuda.empty_cache()  # Clear GPU cache
+                        except Exception as e:
+                            logger.debug(f"GPU cleanup failed for pipeline {key}: {e}")
             # Clear all caches
-            pipeline_count = len(_PIPELINE_CACHE)
-            _PIPELINE_CACHE.clear()
-            _TOKENIZER_CACHE.clear()
-            _MODEL_CACHE.clear()
+            with _CACHE_LOCK:
+                pipeline_count = len(_PIPELINE_CACHE)
+                _PIPELINE_CACHE.clear()
+                _TOKENIZER_CACHE.clear()
+                _MODEL_CACHE.clear()
             if pipeline_count > 0:
                 logger.info(f"Cleaned up {pipeline_count} classification models")
             cleaned_count = pipeline_count
         return cleaned_count

natural_pdf/classification/mixin.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import logging
+import warnings
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 from PIL import Image
-import warnings
 from .results import ClassificationResult
@@ -16,9 +16,51 @@ logger = logging.getLogger(__name__)
 class ClassificationMixin:
-    """
-    Mixin class providing classification capabilities to Page and Region objects.
-    Relies on a ClassificationManager being accessible, typically via the parent PDF.
+    """Mixin class providing classification capabilities to Page and Region objects.
+    This mixin adds AI-powered classification functionality to pages, regions, and
+    elements, enabling document categorization and content analysis using both
+    text-based and vision-based models. It integrates with the ClassificationManager
+    to provide a consistent interface across different model types.
+    The mixin supports both single-label and multi-label classification, confidence
+    thresholding, and various analysis storage strategies for complex workflows.
+    Results are stored in the host object's 'analyses' dictionary for retrieval
+    and further processing.
+    Classification modes:
+    - Text-based: Uses extracted text content for classification
+    - Vision-based: Uses rendered images for visual classification
+    - Automatic: Manager selects best mode based on content availability
+    Host class requirements:
+    - Must implement _get_classification_manager() -> ClassificationManager
+    - Must implement _get_classification_content() -> str | Image
+    - Must have 'analyses' attribute as Dict[str, Any]
+    Example:
+        ```python
+        pdf = npdf.PDF("document.pdf")
+        page = pdf.pages[0]
+        # Document type classification
+        page.classify(['invoice', 'contract', 'report'],
+                     model='text', analysis_key='doc_type')
+        # Multi-label content analysis
+        region = page.find('text:contains("Summary")').below()
+        region.classify(['technical', 'financial', 'legal'],
+                       multi_label=True, min_confidence=0.8)
+        # Access results
+        doc_type = page.analyses['doc_type']
+        content_labels = region.analyses['classification']
+        ```
+    Note:
+        Classification requires appropriate models to be available through the
+        ClassificationManager. Results include confidence scores and detailed
+        metadata for analysis workflows.
     """
     # --- Abstract methods/properties required by the host class --- #
@@ -86,7 +128,9 @@ class ClassificationMixin:
                 # Try text first
                 try:
                     tentative_text = self._get_classification_content("text", **kwargs)
-                    if tentative_text and not (isinstance(tentative_text, str) and tentative_text.isspace()):
+                    if tentative_text and not (
+                        isinstance(tentative_text, str) and tentative_text.isspace()
+                    ):
                         engine = "text"
                         content = tentative_text
                     else:

natural_pdf/classification/results.py CHANGED Viewed

@@ -1,9 +1,9 @@
 # natural_pdf/classification/results.py
 import logging
+from collections.abc import Mapping
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Dict, List, Optional
-from collections.abc import Mapping
 logger = logging.getLogger(__name__)

natural_pdf/cli.py CHANGED Viewed

@@ -1,9 +1,11 @@
 import argparse
 import subprocess
 import sys
-from importlib.metadata import distribution, PackageNotFoundError, version as get_version
+from importlib.metadata import PackageNotFoundError, distribution
+from importlib.metadata import version as get_version
 from pathlib import Path
 from typing import Dict
 from packaging.requirements import Requirement
 # ---------------------------------------------------------------------------
@@ -71,7 +73,9 @@ def main():
     install_p = subparsers.add_parser(
         "install", help="Install optional dependency groups (e.g. paddle, surya)"
     )
-    install_p.add_argument("extras", nargs="+", help="One or more extras to install (e.g. paddle surya)")
+    install_p.add_argument(
+        "extras", nargs="+", help="One or more extras to install (e.g. paddle surya)"
+    )
     install_p.set_defaults(func=cmd_install)
     # list subcommand -------------------------------------------------------
@@ -113,4 +117,4 @@ def cmd_list(args):
 if __name__ == "__main__":
-    main()
+    main()

natural_pdf/collections/pdf_collection.py CHANGED Viewed

@@ -548,37 +548,31 @@ class PDFCollection(
         labels: List[str],
         using: Optional[str] = None,  # Default handled by PDF.classify -> manager
         model: Optional[str] = None,  # Optional model ID
-        max_workers: Optional[int] = None,
         analysis_key: str = "classification",  # Key for storing result in PDF.analyses
         **kwargs,
     ) -> "PDFCollection":
         """
-        Classify each PDF document in the collection, potentially in parallel.
+        Classify each PDF document in the collection using batch processing.
-        This method delegates classification to each PDF object's `classify` method.
-        By default, uses the full extracted text of the PDF.
-        If `using='vision'`, it classifies the first page's image, but ONLY if
-        the PDF has a single page (raises ValueError otherwise).
+        This method gathers content from all PDFs and processes them in a single
+        batch to avoid multiprocessing resource accumulation that can occur with
+        sequential individual classifications.
         Args:
             labels: A list of string category names.
             using: Processing mode ('text', 'vision'). If None, manager infers (defaulting to text).
             model: Optional specific model identifier (e.g., HF ID). If None, manager uses default for 'using' mode.
-            max_workers: Maximum number of threads to process PDFs concurrently.
-                         If None or 1, processing is sequential.
             analysis_key: Key under which to store the ClassificationResult in each PDF's `analyses` dict.
-            **kwargs: Additional arguments passed down to `pdf.classify` (e.g., device,
-                      min_confidence, multi_label, text extraction options).
+            **kwargs: Additional arguments passed down to the ClassificationManager.
         Returns:
             Self for method chaining.
         Raises:
             ValueError: If labels list is empty, or if using='vision' on a multi-page PDF.
-            ClassificationError: If classification fails for any PDF (will stop processing).
+            ClassificationError: If classification fails.
             ImportError: If classification dependencies are missing.
         """
-        PDF = self._get_pdf_class()
         if not labels:
             raise ValueError("Labels list cannot be empty.")
@@ -588,102 +582,103 @@ class PDFCollection(
         mode_desc = f"using='{using}'" if using else f"model='{model}'" if model else "default text"
         logger.info(
-            f"Starting classification for {len(self._pdfs)} PDFs in collection ({mode_desc})..."
+            f"Starting batch classification for {len(self._pdfs)} PDFs in collection ({mode_desc})..."
         )
-        progress_bar = tqdm(
-            total=len(self._pdfs), desc=f"Classifying PDFs ({mode_desc})", unit="pdf"
-        )
-        # Worker function
-        def _process_pdf_classification(pdf: PDF):
-            thread_id = threading.current_thread().name
-            pdf_path = pdf.path
-            logger.debug(f"[{thread_id}] Starting classification process for PDF: {pdf_path}")
-            start_time = time.monotonic()
+        # Get classification manager from first PDF
+        try:
+            first_pdf = self._pdfs[0]
+            if not hasattr(first_pdf, 'get_manager'):
+                raise RuntimeError("PDFs do not support classification manager")
+            manager = first_pdf.get_manager('classification')
+            if not manager or not manager.is_available():
+                raise RuntimeError("ClassificationManager is not available")
+        except Exception as e:
+            from natural_pdf.classification.manager import ClassificationError
+            raise ClassificationError(f"Cannot access ClassificationManager: {e}") from e
+        # Determine processing mode early
+        inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
+        # Gather content from all PDFs
+        pdf_contents = []
+        valid_pdfs = []
+        logger.info(f"Gathering content from {len(self._pdfs)} PDFs for batch classification...")
+        for pdf in self._pdfs:
             try:
-                # Call classify directly on the PDF object
-                pdf.classify(
-                    labels=labels,
-                    using=using,
-                    model=model,
-                    analysis_key=analysis_key,
-                    **kwargs,  # Pass other relevant args like min_confidence, multi_label
-                )
-                end_time = time.monotonic()
-                logger.debug(
-                    f"[{thread_id}] Finished classification for PDF: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
-                )
-                progress_bar.update(1)  # Update progress bar upon success
-                return pdf_path, None  # Return path and no error
-            except ValueError as ve:
-                # Catch specific error for vision on multi-page PDF
-                end_time = time.monotonic()
-                logger.error(
-                    f"[{thread_id}] Skipped classification for {pdf_path} after {end_time - start_time:.2f}s: {ve}",
-                    exc_info=False,
-                )
-                progress_bar.update(1)  # Still update progress bar
-                return pdf_path, ve  # Return the specific ValueError
+                # Get the content for classification - use the same logic as individual PDF classify
+                if inferred_using == "text":
+                    # Extract text content from PDF
+                    content = pdf.extract_text()
+                    if not content or content.isspace():
+                        logger.warning(f"Skipping PDF {pdf.path}: No text content found")
+                        continue
+                elif inferred_using == "vision":
+                    # For vision, we need single-page PDFs only
+                    if len(pdf.pages) != 1:
+                        logger.warning(f"Skipping PDF {pdf.path}: Vision classification requires single-page PDFs")
+                        continue
+                    # Get first page image
+                    content = pdf.pages[0].to_image()
+                else:
+                    raise ValueError(f"Unsupported using mode: {inferred_using}")
+                pdf_contents.append(content)
+                valid_pdfs.append(pdf)
             except Exception as e:
-                end_time = time.monotonic()
-                logger.error(
-                    f"[{thread_id}] Failed classification process for PDF {pdf_path} after {end_time - start_time:.2f}s: {e}",
-                    exc_info=True,  # Log full traceback for unexpected errors
-                )
-                # Close progress bar immediately on critical error to avoid hanging
-                if not progress_bar.disable:
-                    progress_bar.close()
-                # Re-raise the exception to stop the entire collection processing
-                raise ClassificationError(f"Classification failed for {pdf_path}: {e}") from e
+                logger.warning(f"Skipping PDF {pdf.path}: Error getting content - {e}")
+                continue
-        # Use ThreadPoolExecutor for parallel processing if max_workers > 1
-        processed_count = 0
-        skipped_count = 0
+        if not pdf_contents:
+            logger.warning("No valid content could be gathered from PDFs for classification.")
+            return self
+        logger.info(f"Gathered content from {len(valid_pdfs)} PDFs. Running batch classification...")
+        # Run batch classification
         try:
-            if max_workers is not None and max_workers > 1:
-                logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
-                futures = []
-                with concurrent.futures.ThreadPoolExecutor(
-                    max_workers=max_workers, thread_name_prefix="ClassifyWorker"
-                ) as executor:
-                    for pdf in self._pdfs:
-                        futures.append(executor.submit(_process_pdf_classification, pdf))
-                    # Wait for all futures to complete
-                    # Progress updated within worker
-                    for future in concurrent.futures.as_completed(futures):
-                        processed_count += 1
-                        pdf_path, error = (
-                            future.result()
-                        )  # Raise ClassificationError if worker failed critically
-                        if isinstance(error, ValueError):
-                            # Logged in worker, just count as skipped
-                            skipped_count += 1
-            else:  # Sequential processing
-                logger.info("Classifying PDFs sequentially.")
-                for pdf in self._pdfs:
-                    processed_count += 1
-                    pdf_path, error = _process_pdf_classification(
-                        pdf
-                    )  # Raise ClassificationError if worker failed critically
-                    if isinstance(error, ValueError):
-                        skipped_count += 1
-            final_message = (
-                f"Finished classification across the collection. Processed: {processed_count}"
+            batch_results = manager.classify_batch(
+                item_contents=pdf_contents,
+                labels=labels,
+                model_id=model,
+                using=inferred_using,
+                progress_bar=True,  # Let the manager handle progress display
+                **kwargs,
+            )
+        except Exception as e:
+            logger.error(f"Batch classification failed: {e}")
+            from natural_pdf.classification.manager import ClassificationError
+            raise ClassificationError(f"Batch classification failed: {e}") from e
+        # Assign results back to PDFs
+        if len(batch_results) != len(valid_pdfs):
+            logger.error(
+                f"Batch classification result count ({len(batch_results)}) mismatch "
+                f"with PDFs processed ({len(valid_pdfs)}). Cannot assign results."
             )
-            if skipped_count > 0:
-                final_message += f", Skipped (e.g., vision on multi-page): {skipped_count}"
-            logger.info(final_message + ".")
-        finally:
-            # Ensure progress bar is closed properly
-            if not progress_bar.disable and progress_bar.n < progress_bar.total:
-                progress_bar.n = progress_bar.total  # Ensure it reaches 100%
-            if not progress_bar.disable:
-                progress_bar.close()
+            from natural_pdf.classification.manager import ClassificationError
+            raise ClassificationError("Batch result count mismatch with input PDFs")
+        logger.info(f"Assigning {len(batch_results)} results to PDFs under key '{analysis_key}'.")
+        processed_count = 0
+        for pdf, result_obj in zip(valid_pdfs, batch_results):
+            try:
+                if not hasattr(pdf, "analyses") or pdf.analyses is None:
+                    pdf.analyses = {}
+                pdf.analyses[analysis_key] = result_obj
+                processed_count += 1
+            except Exception as e:
+                logger.warning(f"Failed to store classification result for {pdf.path}: {e}")
+        skipped_count = len(self._pdfs) - processed_count
+        final_message = f"Finished batch classification. Processed: {processed_count}"
+        if skipped_count > 0:
+            final_message += f", Skipped: {skipped_count}"
+        logger.info(final_message + ".")
         return self

natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl

natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl