natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/finetuning/index.md +176 -0
  6. docs/index.md +19 -0
  7. docs/ocr/index.md +63 -16
  8. docs/tutorials/01-loading-and-extraction.ipynb +411 -248
  9. docs/tutorials/02-finding-elements.ipynb +123 -46
  10. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  11. docs/tutorials/04-table-extraction.ipynb +17 -12
  12. docs/tutorials/05-excluding-content.ipynb +37 -32
  13. docs/tutorials/06-document-qa.ipynb +36 -31
  14. docs/tutorials/07-layout-analysis.ipynb +45 -40
  15. docs/tutorials/07-working-with-regions.ipynb +61 -60
  16. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  17. docs/tutorials/09-section-extraction.ipynb +160 -155
  18. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  19. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  20. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  21. docs/tutorials/12-ocr-integration.md +68 -106
  22. docs/tutorials/13-semantic-search.ipynb +641 -251
  23. natural_pdf/__init__.py +3 -0
  24. natural_pdf/analyzers/layout/gemini.py +63 -47
  25. natural_pdf/classification/manager.py +343 -0
  26. natural_pdf/classification/mixin.py +149 -0
  27. natural_pdf/classification/results.py +62 -0
  28. natural_pdf/collections/mixins.py +63 -0
  29. natural_pdf/collections/pdf_collection.py +326 -17
  30. natural_pdf/core/element_manager.py +73 -4
  31. natural_pdf/core/page.py +255 -83
  32. natural_pdf/core/pdf.py +385 -367
  33. natural_pdf/elements/base.py +1 -3
  34. natural_pdf/elements/collections.py +279 -49
  35. natural_pdf/elements/region.py +106 -21
  36. natural_pdf/elements/text.py +5 -2
  37. natural_pdf/exporters/__init__.py +4 -0
  38. natural_pdf/exporters/base.py +61 -0
  39. natural_pdf/exporters/paddleocr.py +345 -0
  40. natural_pdf/extraction/manager.py +134 -0
  41. natural_pdf/extraction/mixin.py +246 -0
  42. natural_pdf/extraction/result.py +37 -0
  43. natural_pdf/ocr/__init__.py +16 -8
  44. natural_pdf/ocr/engine.py +46 -30
  45. natural_pdf/ocr/engine_easyocr.py +86 -42
  46. natural_pdf/ocr/engine_paddle.py +39 -28
  47. natural_pdf/ocr/engine_surya.py +32 -16
  48. natural_pdf/ocr/ocr_factory.py +34 -23
  49. natural_pdf/ocr/ocr_manager.py +98 -34
  50. natural_pdf/ocr/ocr_options.py +38 -10
  51. natural_pdf/ocr/utils.py +59 -33
  52. natural_pdf/qa/document_qa.py +0 -4
  53. natural_pdf/selectors/parser.py +363 -238
  54. natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
  55. natural_pdf/utils/debug.py +4 -2
  56. natural_pdf/utils/identifiers.py +9 -5
  57. natural_pdf/utils/locks.py +8 -0
  58. natural_pdf/utils/packaging.py +172 -105
  59. natural_pdf/utils/text_extraction.py +96 -65
  60. natural_pdf/utils/tqdm_utils.py +43 -0
  61. natural_pdf/utils/visualization.py +1 -1
  62. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
  63. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
  64. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  65. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  66. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py CHANGED
@@ -3,6 +3,8 @@ Natural PDF - A more intuitive interface for working with PDFs.
  """
 
  import logging
+ import os
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
  # Create library logger
  logger = logging.getLogger("natural_pdf")
@@ -33,6 +35,7 @@ def configure_logging(level=logging.INFO, handler=None):
 
      logger.propagate = False
 
+
  from natural_pdf.core.page import Page
  from natural_pdf.core.pdf import PDF
  from natural_pdf.elements.collections import ElementCollection
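
The new `os.environ["TOKENIZERS_PARALLELISM"] = "false"` line runs at import time, so it disables Hugging Face `tokenizers` parallelism (and its fork-related warning) before any `transformers` pipeline created by the new classification and extraction modules can initialize it. A minimal sketch of the same ordering constraint in user code; from 0.1.8 onward the package handles this itself:

```python
import os

# The variable only takes effect if set before the `tokenizers` library
# initializes, which is why natural_pdf 0.1.8 sets it in its own
# __init__.py rather than leaving it to callers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import natural_pdf  # noqa: E402  (import after env setup is intentional)
```
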
natural_pdf/analyzers/layout/gemini.py CHANGED
@@ -13,6 +13,7 @@ from PIL import Image
  try:
      from openai import OpenAI
      from openai.types.chat import ChatCompletion
+
      # Import OpenAIError for exception handling if needed
  except ImportError:
      OpenAI = None
@@ -32,7 +33,7 @@ except ImportError:
      class LayoutDetector:
          def __init__(self):
              self.logger = logging.getLogger()
-             self.supported_classes = set() # Will be dynamic based on user request
+             self.supported_classes = set()  # Will be dynamic based on user request
 
          def _get_model(self, options):
              raise NotImplementedError
@@ -41,17 +42,20 @@ except ImportError:
              return n.lower().replace("_", "-").replace(" ", "-")
 
          def validate_classes(self, c):
-             pass # Less strict validation needed for LLM
+             pass  # Less strict validation needed for LLM
 
      logging.basicConfig()
 
  logger = logging.getLogger(__name__)
 
+
  # Define Pydantic model for the expected output structure
  # This is used by the openai library's `response_format`
  class DetectedRegion(BaseModel):
      label: str = Field(description="The identified class name.")
-     bbox: List[float] = Field(description="Bounding box coordinates [xmin, ymin, xmax, ymax].", min_items=4, max_items=4)
+     bbox: List[float] = Field(
+         description="Bounding box coordinates [xmin, ymin, xmax, ymax].", min_items=4, max_items=4
+     )
      confidence: float = Field(description="Confidence score [0.0, 1.0].", ge=0.0, le=1.0)
 
 
@@ -63,23 +67,27 @@ class GeminiLayoutDetector(LayoutDetector):
 
      def __init__(self):
          super().__init__()
-         self.supported_classes = set() # Indicate dynamic nature
+         self.supported_classes = set()  # Indicate dynamic nature
 
      def is_available(self) -> bool:
          """Check if openai library is installed and GOOGLE_API_KEY is available."""
          api_key = os.environ.get("GOOGLE_API_KEY")
          if not api_key:
-             logger.warning("GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.")
+             logger.warning(
+                 "GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available."
+             )
              return False
          if OpenAI is None:
-             logger.warning("openai package not found. Gemini detector (via OpenAI lib) will not be available.")
-             return False
+             logger.warning(
+                 "openai package not found. Gemini detector (via OpenAI lib) will not be available."
+             )
+             return False
          return True
 
      def _get_cache_key(self, options: GeminiLayoutOptions) -> str:
          """Generate cache key based on model name."""
          if not isinstance(options, GeminiLayoutOptions):
-             options = GeminiLayoutOptions() # Use defaults
+             options = GeminiLayoutOptions()  # Use defaults
 
          model_key = options.model_name
          # Prompt is built dynamically, so not part of cache key based on options
@@ -101,9 +109,7 @@ class GeminiLayoutDetector(LayoutDetector):
      def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
          """Detect layout elements in an image using Gemini via OpenAI library."""
          if not self.is_available():
-             raise RuntimeError(
-                 "OpenAI library not installed or GOOGLE_API_KEY not set."
-             )
+             raise RuntimeError("OpenAI library not installed or GOOGLE_API_KEY not set.")
 
          # Ensure options are the correct type
          if not isinstance(options, GeminiLayoutOptions):
@@ -124,10 +130,7 @@ class GeminiLayoutDetector(LayoutDetector):
          detections = []
          try:
              # --- 1. Initialize OpenAI Client for Gemini ---
-             client = OpenAI(
-                 api_key=api_key,
-                 base_url=self.GEMINI_BASE_URL
-             )
+             client = OpenAI(api_key=api_key, base_url=self.GEMINI_BASE_URL)
 
              # --- 2. Prepare Input for OpenAI API ---
              if not options.classes:
@@ -139,11 +142,11 @@ class GeminiLayoutDetector(LayoutDetector):
              # Convert image to base64
              buffered = io.BytesIO()
              image.save(buffered, format="PNG")
-             img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+             img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
              image_url = f"data:image/png;base64,{img_base64}"
 
              # Construct the prompt text
-             class_list_str = ", ".join(f'`{c}`' for c in options.classes)
+             class_list_str = ", ".join(f"`{c}`" for c in options.classes)
              prompt_text = (
                  f"Analyze the provided image of a document page ({width}x{height}). "
                  f"Identify all regions corresponding to the following types: {class_list_str}. "
@@ -165,14 +168,18 @@ class GeminiLayoutDetector(LayoutDetector):
              ]
 
              # --- 3. Call OpenAI API using .parse for structured output ---
-             logger.debug(f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {options.classes}")
+             logger.debug(
+                 f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {options.classes}"
+             )
 
              # Extract relevant generation parameters from extra_args if provided
              # Mapping common names: temperature, top_p, max_tokens
              completion_kwargs = {
-                 "temperature": options.extra_args.get("temperature", 0.2), # Default to low temp
+                 "temperature": options.extra_args.get("temperature", 0.2),  # Default to low temp
                  "top_p": options.extra_args.get("top_p"),
-                 "max_tokens": options.extra_args.get("max_tokens", 4096), # Map from max_output_tokens
+                 "max_tokens": options.extra_args.get(
+                     "max_tokens", 4096
+                 ),  # Map from max_output_tokens
              }
              # Filter out None values
              completion_kwargs = {k: v for k, v in completion_kwargs.items() if v is not None}
@@ -180,13 +187,13 @@ class GeminiLayoutDetector(LayoutDetector):
              completion: ChatCompletion = client.beta.chat.completions.parse(
                  model=model_name,
                  messages=messages,
-                 response_format=List[DetectedRegion], # Pass the Pydantic model list
-                 **completion_kwargs
+                 response_format=List[DetectedRegion],  # Pass the Pydantic model list
+                 **completion_kwargs,
              )
 
              logger.debug(f"Gemini response received via OpenAI lib.")
 
-             # --- 4. Process Parsed Response ---
+             # --- 4. Process Parsed Response ---
              if not completion.choices:
                  logger.error("Gemini response (via OpenAI lib) contained no choices.")
                  return []
@@ -194,16 +201,18 @@ class GeminiLayoutDetector(LayoutDetector):
              # Get the parsed Pydantic objects
              parsed_results = completion.choices[0].message.parsed
              if not parsed_results or not isinstance(parsed_results, list):
-                 logger.error(f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}")
-                 return []
+                 logger.error(
+                     f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}"
+                 )
+                 return []
 
-             # --- 5. Convert to Detections & Filter ---
-             normalized_classes_req = {
-                 self._normalize_class_name(c) for c in options.classes
-             }
-             normalized_classes_excl = {
-                 self._normalize_class_name(c) for c in options.exclude_classes
-             } if options.exclude_classes else set()
+             # --- 5. Convert to Detections & Filter ---
+             normalized_classes_req = {self._normalize_class_name(c) for c in options.classes}
+             normalized_classes_excl = (
+                 {self._normalize_class_name(c) for c in options.exclude_classes}
+                 if options.exclude_classes
+                 else set()
+             )
 
              for item in parsed_results:
                  # The item is already a validated DetectedRegion Pydantic object
@@ -215,33 +224,41 @@ class GeminiLayoutDetector(LayoutDetector):
                  # Coordinates should already be floats, but ensure tuple format
                  xmin, ymin, xmax, ymax = tuple(bbox_raw)
 
-                 # --- Apply Filtering ---
+                 # --- Apply Filtering ---
                  normalized_class = self._normalize_class_name(label)
 
                  # Check against requested classes (Should be guaranteed by schema, but doesn't hurt)
                  if normalized_class not in normalized_classes_req:
-                     logger.warning(f"Gemini (via OpenAI) returned unexpected class '{label}' despite schema. Skipping.")
+                     logger.warning(
+                         f"Gemini (via OpenAI) returned unexpected class '{label}' despite schema. Skipping."
+                     )
                      continue
 
                  # Check against excluded classes
                  if normalized_class in normalized_classes_excl:
-                     logger.debug(f"Skipping excluded class '{label}' (normalized: {normalized_class}).")
+                     logger.debug(
+                         f"Skipping excluded class '{label}' (normalized: {normalized_class})."
+                     )
                      continue
-
+
                  # Check against base confidence threshold from options
                  if confidence_score < options.confidence:
-                     logger.debug(f"Skipping item with confidence {confidence_score:.3f} below threshold {options.confidence}.")
+                     logger.debug(
+                         f"Skipping item with confidence {confidence_score:.3f} below threshold {options.confidence}."
+                     )
                      continue
 
                  # Add detection
-                 detections.append({
-                     "bbox": (xmin, ymin, xmax, ymax),
-                     "class": label, # Use original label from LLM
-                     "confidence": confidence_score,
-                     "normalized_class": normalized_class,
-                     "source": "layout",
-                     "model": "gemini", # Keep model name generic as gemini
-                 })
+                 detections.append(
+                     {
+                         "bbox": (xmin, ymin, xmax, ymax),
+                         "class": label,  # Use original label from LLM
+                         "confidence": confidence_score,
+                         "normalized_class": normalized_class,
+                         "source": "layout",
+                         "model": "gemini",  # Keep model name generic as gemini
+                     }
+                 )
 
              self.logger.info(
                  f"Gemini (via OpenAI lib) processed response. Detected {len(detections)} layout elements matching criteria."
@@ -260,5 +277,4 @@ class GeminiLayoutDetector(LayoutDetector):
 
      def validate_classes(self, classes: List[str]):
          """Validation is less critical as we pass requested classes to the LLM."""
-         pass # Override base validation if needed, but likely not necessary
-
+         pass  # Override base validation if needed, but likely not necessary
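
The reworked `gemini.py` above talks to Gemini through the `openai` client pointed at Google's OpenAI-compatible endpoint, and relies on `client.beta.chat.completions.parse` to get schema-validated `DetectedRegion` objects back instead of hand-parsing JSON. A standalone sketch of that pattern follows; the base URL and model name are illustrative assumptions (the diff only shows them as `self.GEMINI_BASE_URL` and `model_name`), not values confirmed by this release:

```python
import os
from typing import List

from openai import OpenAI
from pydantic import BaseModel, Field


class DetectedRegion(BaseModel):
    label: str = Field(description="The identified class name.")
    bbox: List[float] = Field(
        description="Bounding box coordinates [xmin, ymin, xmax, ymax].", min_items=4, max_items=4
    )
    confidence: float = Field(description="Confidence score [0.0, 1.0].", ge=0.0, le=1.0)


# Endpoint URL and model name below are assumptions for illustration.
client = OpenAI(
    api_key=os.environ["GOOGLE_API_KEY"],
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)

completion = client.beta.chat.completions.parse(
    model="gemini-2.0-flash",  # hypothetical; the detector reads options.model_name
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Identify `title` and `table` regions on this page."},
                {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
            ],
        }
    ],
    response_format=List[DetectedRegion],  # same structured-output trick as the diff
    temperature=0.2,
)
regions = completion.choices[0].message.parsed  # validated DetectedRegion objects
```
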
natural_pdf/classification/manager.py ADDED
@@ -0,0 +1,343 @@
+ import logging
+ import time
+ from datetime import datetime
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, Tuple
+
+ # Use try-except for robustness if dependencies are missing
+ try:
+     import torch
+     from PIL import Image
+     from transformers import pipeline, AutoTokenizer, AutoModelForZeroShotImageClassification, AutoModelForSequenceClassification
+     _CLASSIFICATION_AVAILABLE = True
+ except ImportError:
+     _CLASSIFICATION_AVAILABLE = False
+     # Define dummy types for type hinting if imports fail
+     Image = type("Image", (), {})
+     pipeline = object
+     AutoTokenizer = object
+     AutoModelForZeroShotImageClassification = object
+     AutoModelForSequenceClassification = object
+     torch = None
+
+ # Import result classes
+ from .results import ClassificationResult, CategoryScore
+ from natural_pdf.utils.tqdm_utils import get_tqdm
+
+ if TYPE_CHECKING:
+     from transformers import Pipeline
+
+
+ logger = logging.getLogger(__name__)
+
+ # Global cache for models/pipelines
+ _PIPELINE_CACHE: Dict[str, "Pipeline"] = {}
+ _TOKENIZER_CACHE: Dict[str, Any] = {}
+ _MODEL_CACHE: Dict[str, Any] = {}
+
+ class ClassificationError(Exception):
+     """Custom exception for classification errors."""
+     pass
+
+
+ class ClassificationManager:
+     """Manages classification models and execution."""
+
+     DEFAULT_TEXT_MODEL = "facebook/bart-large-mnli"
+     DEFAULT_VISION_MODEL = "openai/clip-vit-base-patch16"
+
+     def __init__(
+         self,
+         model_mapping: Optional[Dict[str, str]] = None,
+         default_device: Optional[str] = None,
+     ):
+         """
+         Initialize the ClassificationManager.
+
+         Args:
+             model_mapping: Optional dictionary mapping aliases ('text', 'vision') to model IDs.
+             default_device: Default device ('cpu', 'cuda') if not specified in classify calls.
+         """
+         if not _CLASSIFICATION_AVAILABLE:
+             raise ImportError(
+                 "Classification dependencies missing. "
+                 "Install with: pip install \"natural-pdf[classification]\""
+             )
+
+         self.pipelines: Dict[Tuple[str, str], "Pipeline"] = {}  # Cache: (model_id, device) -> pipeline
+
+         self.device = default_device
+         logger.info(f"ClassificationManager initialized on device: {self.device}")
+
+     def is_available(self) -> bool:
+         """Check if required dependencies are installed."""
+         return _CLASSIFICATION_AVAILABLE
+
+     def _get_pipeline(self, model_id: str, using: str) -> "Pipeline":
+         """Get or create a classification pipeline."""
+         cache_key = f"{model_id}_{using}_{self.device}"
+         if cache_key not in _PIPELINE_CACHE:
+             logger.info(f"Loading {using} classification pipeline for model '{model_id}' on device '{self.device}'...")
+             start_time = time.time()
+             try:
+                 task = (
+                     "zero-shot-classification"
+                     if using == "text"
+                     else "zero-shot-image-classification"
+                 )
+                 _PIPELINE_CACHE[cache_key] = pipeline(
+                     task,
+                     model=model_id,
+                     device=self.device
+                 )
+                 end_time = time.time()
+                 logger.info(f"Pipeline for '{model_id}' loaded in {end_time - start_time:.2f} seconds.")
+             except Exception as e:
+                 logger.error(f"Failed to load pipeline for model '{model_id}' (using: {using}): {e}", exc_info=True)
+                 raise ClassificationError(f"Failed to load pipeline for model '{model_id}'. Ensure the model ID is correct and supports the {task} task.") from e
+         return _PIPELINE_CACHE[cache_key]
+
+     def infer_using(self, model_id: str, using: Optional[str] = None) -> str:
+         """Infers processing mode ('text' or 'vision') if not provided."""
+         if using in ["text", "vision"]:
+             return using
+
+         # Simple inference based on common model names
+         normalized_model_id = model_id.lower()
+         if "clip" in normalized_model_id or "vit" in normalized_model_id or "siglip" in normalized_model_id:
+             logger.debug(f"Inferred using='vision' for model '{model_id}'")
+             return "vision"
+         if "bart" in normalized_model_id or "bert" in normalized_model_id or "mnli" in normalized_model_id or "xnli" in normalized_model_id or "deberta" in normalized_model_id:
+             logger.debug(f"Inferred using='text' for model '{model_id}'")
+             return "text"
+
+         # Fallback or raise error? Let's try loading text first, then vision.
+         logger.warning(f"Could not reliably infer mode for '{model_id}'. Trying text, then vision pipeline loading.")
+         try:
+             self._get_pipeline(model_id, "text")
+             logger.info(f"Successfully loaded '{model_id}' as a text model.")
+             return "text"
+         except Exception:
+             logger.warning(f"Failed to load '{model_id}' as text model. Trying vision.")
+             try:
+                 self._get_pipeline(model_id, "vision")
+                 logger.info(f"Successfully loaded '{model_id}' as a vision model.")
+                 return "vision"
+             except Exception as e_vision:
+                 logger.error(f"Failed to load '{model_id}' as either text or vision model.", exc_info=True)
+                 raise ClassificationError(f"Cannot determine mode for model '{model_id}'. Please specify `using='text'` or `using='vision'`. Error: {e_vision}")
+
+     def classify_item(
+         self,
+         item_content: Union[str, Image.Image],
+         categories: List[str],
+         model_id: Optional[str] = None,
+         using: Optional[str] = None,
+         min_confidence: float = 0.0,
+         multi_label: bool = False,
+         **kwargs
+     ) -> ClassificationResult:  # Return ClassificationResult
+         """Classifies a single item (text or image)."""
+
+         # Determine model and engine type
+         effective_using = using
+         if model_id is None:
+             # Try inferring based on content type
+             if isinstance(item_content, str):
+                 effective_using = "text"
+                 model_id = self.DEFAULT_TEXT_MODEL
+             elif isinstance(item_content, Image.Image):
+                 effective_using = "vision"
+                 model_id = self.DEFAULT_VISION_MODEL
+             else:
+                 raise TypeError(f"Unsupported item_content type: {type(item_content)}")
+         else:
+             # Infer engine type if not given
+             effective_using = self.infer_using(model_id, using)
+             # Set default model if needed (though should usually be provided if engine known)
+             if model_id is None:
+                 model_id = self.DEFAULT_TEXT_MODEL if effective_using == "text" else self.DEFAULT_VISION_MODEL
+
+         if not categories:
+             raise ValueError("Categories list cannot be empty.")
+
+         pipeline_instance = self._get_pipeline(model_id, effective_using)
+         timestamp = datetime.now()
+         parameters = {  # Store parameters used for this run
+             'categories': categories,
+             'model_id': model_id,
+             'using': effective_using,
+             'min_confidence': min_confidence,
+             'multi_label': multi_label,
+             **kwargs
+         }
+
+         logger.debug(f"Classifying content (type: {type(item_content).__name__}) with model '{model_id}'")
+         try:
+             # Handle potential kwargs for specific pipelines if needed
+             # The zero-shot pipelines expect `candidate_labels`
+             result_raw = pipeline_instance(item_content, candidate_labels=categories, multi_label=multi_label, **kwargs)
+             logger.debug(f"Raw pipeline result: {result_raw}")
+
+             # --- Process raw result into ClassificationResult --- #
+             scores_list: List[CategoryScore] = []
+
+             # Handle text pipeline format (dict with 'labels' and 'scores')
+             if isinstance(result_raw, dict) and 'labels' in result_raw and 'scores' in result_raw:
+                 for label, score_val in zip(result_raw['labels'], result_raw['scores']):
+                     if score_val >= min_confidence:
+                         try:
+                             scores_list.append(CategoryScore(label=label, confidence=score_val))
+                         except (ValueError, TypeError) as score_err:
+                             logger.warning(f"Skipping invalid score from text pipeline: label='{label}', score={score_val}. Error: {score_err}")
+             # Handle vision pipeline format (list of dicts with 'label' and 'score')
+             elif isinstance(result_raw, list) and all(isinstance(item, dict) and 'label' in item and 'score' in item for item in result_raw):
+                 for item in result_raw:
+                     score_val = item['score']
+                     label = item['label']
+                     if score_val >= min_confidence:
+                         try:
+                             scores_list.append(CategoryScore(label=label, confidence=score_val))
+                         except (ValueError, TypeError) as score_err:
+                             logger.warning(f"Skipping invalid score from vision pipeline: label='{label}', score={score_val}. Error: {score_err}")
+             else:
+                 logger.warning(f"Unexpected raw result format from pipeline for model '{model_id}': {type(result_raw)}. Cannot extract scores.")
+                 # Return empty result?
+                 # scores_list = []
+
+             return ClassificationResult(
+                 model_id=model_id,
+                 using=effective_using,
+                 timestamp=timestamp,
+                 parameters=parameters,
+                 scores=scores_list
+             )
+             # --- End Processing --- #
+
+         except Exception as e:
+             logger.error(f"Classification failed for model '{model_id}': {e}", exc_info=True)
+             # Return an empty result object on failure?
+             # return ClassificationResult(model_id=model_id, engine_type=engine_type, timestamp=timestamp, parameters=parameters, scores=[])
+             raise ClassificationError(f"Classification failed using model '{model_id}'. Error: {e}") from e
+
+     def classify_batch(
+         self,
+         item_contents: List[Union[str, Image.Image]],
+         categories: List[str],
+         model_id: Optional[str] = None,
+         using: Optional[str] = None,
+         min_confidence: float = 0.0,
+         multi_label: bool = False,
+         batch_size: int = 8,
+         progress_bar: bool = True,
+         **kwargs
+     ) -> List[ClassificationResult]:  # Return list of ClassificationResult
+         """Classifies a batch of items (text or image) using the pipeline's batching."""
+         if not item_contents:
+             return []
+
+         # Determine model and engine type (assuming uniform type in batch)
+         first_item = item_contents[0]
+         effective_using = using
+         if model_id is None:
+             if isinstance(first_item, str):
+                 effective_using = "text"
+                 model_id = self.DEFAULT_TEXT_MODEL
+             elif isinstance(first_item, Image.Image):
+                 effective_using = "vision"
+                 model_id = self.DEFAULT_VISION_MODEL
+             else:
+                 raise TypeError(f"Unsupported item_content type in batch: {type(first_item)}")
+         else:
+             effective_using = self.infer_using(model_id, using)
+             if model_id is None:
+                 model_id = self.DEFAULT_TEXT_MODEL if effective_using == "text" else self.DEFAULT_VISION_MODEL
+
+         if not categories:
+             raise ValueError("Categories list cannot be empty.")
+
+         pipeline_instance = self._get_pipeline(model_id, effective_using)
+         timestamp = datetime.now()  # Single timestamp for the batch run
+         parameters = {  # Parameters for the whole batch
+             'categories': categories,
+             'model_id': model_id,
+             'using': effective_using,
+             'min_confidence': min_confidence,
+             'multi_label': multi_label,
+             'batch_size': batch_size,
+             **kwargs
+         }
+
+         logger.info(f"Classifying batch of {len(item_contents)} items with model '{model_id}' (batch size: {batch_size})")
+         batch_results_list: List[ClassificationResult] = []
+
+         try:
+             # Use pipeline directly for batching
+             results_iterator = pipeline_instance(
+                 item_contents,
+                 candidate_labels=categories,
+                 multi_label=multi_label,
+                 batch_size=batch_size,
+                 **kwargs
+             )
+
+             # Wrap with tqdm for progress if requested
+             total_items = len(item_contents)
+             if progress_bar:
+                 # Get the appropriate tqdm class
+                 tqdm_class = get_tqdm()
+                 results_iterator = tqdm_class(
+                     results_iterator,
+                     total=total_items,
+                     desc=f"Classifying batch ({model_id})",
+                     leave=False  # Don't leave progress bar hanging
+                 )
+
+             for raw_result in results_iterator:
+                 # --- Process each raw result (which corresponds to ONE input item) --- #
+                 scores_list: List[CategoryScore] = []
+                 try:
+                     # Check for text format (dict with 'labels' and 'scores')
+                     if isinstance(raw_result, dict) and 'labels' in raw_result and 'scores' in raw_result:
+                         for label, score_val in zip(raw_result['labels'], raw_result['scores']):
+                             if score_val >= min_confidence:
+                                 try:
+                                     scores_list.append(CategoryScore(label=label, confidence=score_val))
+                                 except (ValueError, TypeError) as score_err:
+                                     logger.warning(f"Skipping invalid score from text pipeline batch: label='{label}', score={score_val}. Error: {score_err}")
+                     # Check for vision format (list of dicts with 'label' and 'score')
+                     elif isinstance(raw_result, list):
+                         for item in raw_result:
+                             try:
+                                 score_val = item['score']
+                                 label = item['label']
+                                 if score_val >= min_confidence:
+                                     scores_list.append(CategoryScore(label=label, confidence=score_val))
+                             except (KeyError, ValueError, TypeError) as item_err:
+                                 logger.warning(f"Skipping invalid item in vision result list from batch: {item}. Error: {item_err}")
+                     else:
+                         logger.warning(f"Unexpected raw result format in batch item from model '{model_id}': {type(raw_result)}. Cannot extract scores.")
+
+                 except Exception as proc_err:
+                     logger.error(f"Error processing result item in batch: {proc_err}", exc_info=True)
+                     # scores_list remains empty for this item
+
+                 # Append result object for this item
+                 batch_results_list.append(ClassificationResult(
+                     model_id=model_id,
+                     using=effective_using,
+                     timestamp=timestamp,  # Use same timestamp for batch
+                     parameters=parameters,  # Use same params for batch
+                     scores=scores_list
+                 ))
+                 # --- End Processing --- #
+
+             if len(batch_results_list) != total_items:
+                 logger.warning(f"Batch classification returned {len(batch_results_list)} results, but expected {total_items}. Results might be incomplete or misaligned.")
+
+             return batch_results_list
+
+         except Exception as e:
+             logger.error(f"Batch classification failed for model '{model_id}': {e}", exc_info=True)
+             # Return list of empty results?
+             # return [ClassificationResult(model_id=model_id, s=engine_type, timestamp=timestamp, parameters=parameters, scores=[]) for _ in item_contents]
+             raise ClassificationError(f"Batch classification failed using model '{model_id}'. Error: {e}") from e
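
The new `ClassificationManager` caches one zero-shot pipeline per model/mode/device combination in `_PIPELINE_CACHE`, infers text vs. vision mode from the model name (falling back to trying each pipeline in turn), and normalizes both pipeline output shapes into `ClassificationResult` objects. A minimal usage sketch, assuming the `natural-pdf[classification]` extra is installed; the sample text and category names are invented for illustration:

```python
from natural_pdf.classification.manager import ClassificationManager

manager = ClassificationManager(default_device="cpu")

# A plain string routes to the default text model
# ("facebook/bart-large-mnli") via the zero-shot-classification task.
result = manager.classify_item(
    "Total amount due: $1,234.56",  # invented sample text
    categories=["invoice", "letter", "report"],
    min_confidence=0.1,
)
for score in result.scores:
    print(score.label, score.confidence)

# Batched: one ClassificationResult per input item, all sharing the
# batch's timestamp and parameter dict; a tqdm bar shows progress.
results = manager.classify_batch(
    ["First sample text...", "Second sample text..."],
    categories=["invoice", "letter", "report"],
    batch_size=8,
)
```
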