PyPI - natural-pdf - Versions diffs - 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl - Mend

natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134)

natural_pdf/__init__.py +1 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +241 -158
natural_pdf/classification/mixin.py +52 -38
natural_pdf/classification/results.py +71 -45
natural_pdf/collections/mixins.py +85 -20
natural_pdf/collections/pdf_collection.py +245 -100
natural_pdf/core/element_manager.py +30 -14
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +423 -101
natural_pdf/core/pdf.py +694 -195
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +610 -134
natural_pdf/elements/region.py +659 -90
natural_pdf/elements/text.py +1 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +4 -3
natural_pdf/extraction/manager.py +50 -49
natural_pdf/extraction/mixin.py +90 -57
natural_pdf/extraction/result.py +9 -23
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +61 -25
natural_pdf/ocr/ocr_options.py +70 -10
natural_pdf/ocr/utils.py +6 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +219 -143
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +1 -1
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +24 -16
natural_pdf/utils/tqdm_utils.py +18 -10
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
natural_pdf-0.1.10.dist-info/RECORD +80 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/categorizing-documents/index.md +0 -168
docs/data-extraction/index.md +0 -87
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -969
docs/element-selection/index.md +0 -249
docs/finetuning/index.md +0 -176
docs/index.md +0 -189
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -256
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -417
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -152
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -119
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -275
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -337
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -293
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -414
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -513
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2439
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -517
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -3712
docs/tutorials/12-ocr-integration.md +0 -137
docs/tutorials/13-semantic-search.ipynb +0 -1718
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.8.dist-info/RECORD +0 -156
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0

natural_pdf/classification/manager.py CHANGED Viewed

@@ -1,28 +1,35 @@
 import logging
 import time
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from PIL import Image
 # Use try-except for robustness if dependencies are missing
 try:
     import torch
-    from PIL import Image
-    from transformers import pipeline, AutoTokenizer, AutoModelForZeroShotImageClassification, AutoModelForSequenceClassification
+    from transformers import (
+        AutoModelForSequenceClassification,
+        AutoModelForZeroShotImageClassification,
+        AutoTokenizer,
+        pipeline,
+    )
     _CLASSIFICATION_AVAILABLE = True
 except ImportError:
     _CLASSIFICATION_AVAILABLE = False
     # Define dummy types for type hinting if imports fail
-    Image = type("Image", (), {})
     pipeline = object
     AutoTokenizer = object
     AutoModelForZeroShotImageClassification = object
     AutoModelForSequenceClassification = object
     torch = None
-# Import result classes
-from .results import ClassificationResult, CategoryScore
 from natural_pdf.utils.tqdm_utils import get_tqdm
+# Import result classes
+from .results import CategoryScore, ClassificationResult
 if TYPE_CHECKING:
     from transformers import Pipeline
@@ -34,8 +41,10 @@ _PIPELINE_CACHE: Dict[str, "Pipeline"] = {}
 _TOKENIZER_CACHE: Dict[str, Any] = {}
 _MODEL_CACHE: Dict[str, Any] = {}
 class ClassificationError(Exception):
     """Custom exception for classification errors."""
     pass
@@ -60,10 +69,12 @@ class ClassificationManager:
         if not _CLASSIFICATION_AVAILABLE:
             raise ImportError(
                 "Classification dependencies missing. "
-                "Install with: pip install \"natural-pdf[classification]\""
+                'Install with: pip install "natural-pdf[classification]"'
             )
-        self.pipelines: Dict[Tuple[str, str], "Pipeline"] = {} # Cache: (model_id, device) -> pipeline
+        self.pipelines: Dict[Tuple[str, str], "Pipeline"] = (
+            {}
+        )  # Cache: (model_id, device) -> pipeline
         self.device = default_device
         logger.info(f"ClassificationManager initialized on device: {self.device}")
@@ -76,7 +87,9 @@ class ClassificationManager:
         """Get or create a classification pipeline."""
         cache_key = f"{model_id}_{using}_{self.device}"
         if cache_key not in _PIPELINE_CACHE:
-            logger.info(f"Loading {using} classification pipeline for model '{model_id}' on device '{self.device}'...")
+            logger.info(
+                f"Loading {using} classification pipeline for model '{model_id}' on device '{self.device}'..."
+            )
             start_time = time.time()
             try:
                 task = (
@@ -84,16 +97,19 @@ class ClassificationManager:
                     if using == "text"
                     else "zero-shot-image-classification"
                 )
-                _PIPELINE_CACHE[cache_key] = pipeline(
-                    task,
-                    model=model_id,
-                    device=self.device
-                )
+                _PIPELINE_CACHE[cache_key] = pipeline(task, model=model_id, device=self.device)
                 end_time = time.time()
-                logger.info(f"Pipeline for '{model_id}' loaded in {end_time - start_time:.2f} seconds.")
+                logger.info(
+                    f"Pipeline for '{model_id}' loaded in {end_time - start_time:.2f} seconds."
+                )
             except Exception as e:
-                logger.error(f"Failed to load pipeline for model '{model_id}' (using: {using}): {e}", exc_info=True)
-                raise ClassificationError(f"Failed to load pipeline for model '{model_id}'. Ensure the model ID is correct and supports the {task} task.") from e
+                logger.error(
+                    f"Failed to load pipeline for model '{model_id}' (using: {using}): {e}",
+                    exc_info=True,
+                )
+                raise ClassificationError(
+                    f"Failed to load pipeline for model '{model_id}'. Ensure the model ID is correct and supports the {task} task."
+                ) from e
         return _PIPELINE_CACHE[cache_key]
     def infer_using(self, model_id: str, using: Optional[str] = None) -> str:
@@ -103,241 +119,308 @@ class ClassificationManager:
         # Simple inference based on common model names
         normalized_model_id = model_id.lower()
-        if "clip" in normalized_model_id or "vit" in normalized_model_id or "siglip" in normalized_model_id:
-             logger.debug(f"Inferred using='vision' for model '{model_id}'")
-             return "vision"
-        if "bart" in normalized_model_id or "bert" in normalized_model_id or "mnli" in normalized_model_id or "xnli" in normalized_model_id or "deberta" in normalized_model_id:
-             logger.debug(f"Inferred using='text' for model '{model_id}'")
-             return "text"
+        if (
+            "clip" in normalized_model_id
+            or "vit" in normalized_model_id
+            or "siglip" in normalized_model_id
+        ):
+            logger.debug(f"Inferred using='vision' for model '{model_id}'")
+            return "vision"
+        if (
+            "bart" in normalized_model_id
+            or "bert" in normalized_model_id
+            or "mnli" in normalized_model_id
+            or "xnli" in normalized_model_id
+            or "deberta" in normalized_model_id
+        ):
+            logger.debug(f"Inferred using='text' for model '{model_id}'")
+            return "text"
         # Fallback or raise error? Let's try loading text first, then vision.
-        logger.warning(f"Could not reliably infer mode for '{model_id}'. Trying text, then vision pipeline loading.")
+        logger.warning(
+            f"Could not reliably infer mode for '{model_id}'. Trying text, then vision pipeline loading."
+        )
         try:
             self._get_pipeline(model_id, "text")
             logger.info(f"Successfully loaded '{model_id}' as a text model.")
             return "text"
         except Exception:
-             logger.warning(f"Failed to load '{model_id}' as text model. Trying vision.")
-             try:
-                 self._get_pipeline(model_id, "vision")
-                 logger.info(f"Successfully loaded '{model_id}' as a vision model.")
-                 return "vision"
-             except Exception as e_vision:
-                 logger.error(f"Failed to load '{model_id}' as either text or vision model.", exc_info=True)
-                 raise ClassificationError(f"Cannot determine mode for model '{model_id}'. Please specify `using='text'` or `using='vision'`. Error: {e_vision}")
+            logger.warning(f"Failed to load '{model_id}' as text model. Trying vision.")
+            try:
+                self._get_pipeline(model_id, "vision")
+                logger.info(f"Successfully loaded '{model_id}' as a vision model.")
+                return "vision"
+            except Exception as e_vision:
+                logger.error(
+                    f"Failed to load '{model_id}' as either text or vision model.", exc_info=True
+                )
+                raise ClassificationError(
+                    f"Cannot determine mode for model '{model_id}'. Please specify `using='text'` or `using='vision'`. Error: {e_vision}"
+                )
     def classify_item(
         self,
         item_content: Union[str, Image.Image],
-        categories: List[str],
+        labels: List[str],
         model_id: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
         multi_label: bool = False,
-        **kwargs
-    ) -> ClassificationResult: # Return ClassificationResult
+        **kwargs,
+    ) -> ClassificationResult:  # Return ClassificationResult
         """Classifies a single item (text or image)."""
         # Determine model and engine type
         effective_using = using
         if model_id is None:
-             # Try inferring based on content type
-             if isinstance(item_content, str):
-                 effective_using = "text"
-                 model_id = self.DEFAULT_TEXT_MODEL
-             elif isinstance(item_content, Image.Image):
-                 effective_using = "vision"
-                 model_id = self.DEFAULT_VISION_MODEL
-             else:
-                 raise TypeError(f"Unsupported item_content type: {type(item_content)}")
+            # Try inferring based on content type
+            if isinstance(item_content, str):
+                effective_using = "text"
+                model_id = self.DEFAULT_TEXT_MODEL
+            elif isinstance(item_content, Image.Image):
+                effective_using = "vision"
+                model_id = self.DEFAULT_VISION_MODEL
+            else:
+                raise TypeError(f"Unsupported item_content type: {type(item_content)}")
         else:
-             # Infer engine type if not given
-             effective_using = self.infer_using(model_id, using)
-             # Set default model if needed (though should usually be provided if engine known)
-             if model_id is None:
-                  model_id = self.DEFAULT_TEXT_MODEL if effective_using == "text" else self.DEFAULT_VISION_MODEL
+            # Infer engine type if not given
+            effective_using = self.infer_using(model_id, using)
+            # Set default model if needed (though should usually be provided if engine known)
+            if model_id is None:
+                model_id = (
+                    self.DEFAULT_TEXT_MODEL
+                    if effective_using == "text"
+                    else self.DEFAULT_VISION_MODEL
+                )
-        if not categories:
-             raise ValueError("Categories list cannot be empty.")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
         pipeline_instance = self._get_pipeline(model_id, effective_using)
         timestamp = datetime.now()
-        parameters = { # Store parameters used for this run
-            'categories': categories,
-            'model_id': model_id,
-            'using': effective_using,
-            'min_confidence': min_confidence,
-            'multi_label': multi_label,
-            **kwargs
+        parameters = {  # Store parameters used for this run
+            "labels": labels,
+            "model_id": model_id,
+            "using": effective_using,
+            "min_confidence": min_confidence,
+            "multi_label": multi_label,
+            **kwargs,
         }
-        logger.debug(f"Classifying content (type: {type(item_content).__name__}) with model '{model_id}'")
+        logger.debug(
+            f"Classifying content (type: {type(item_content).__name__}) with model '{model_id}'"
+        )
         try:
             # Handle potential kwargs for specific pipelines if needed
             # The zero-shot pipelines expect `candidate_labels`
-            result_raw = pipeline_instance(item_content, candidate_labels=categories, multi_label=multi_label, **kwargs)
+            result_raw = pipeline_instance(
+                item_content, candidate_labels=labels, multi_label=multi_label, **kwargs
+            )
             logger.debug(f"Raw pipeline result: {result_raw}")
-            # --- Process raw result into ClassificationResult --- #
+            # --- Process raw result into ClassificationResult --- #
             scores_list: List[CategoryScore] = []
             # Handle text pipeline format (dict with 'labels' and 'scores')
-            if isinstance(result_raw, dict) and 'labels' in result_raw and 'scores' in result_raw:
-                for label, score_val in zip(result_raw['labels'], result_raw['scores']):
-                     if score_val >= min_confidence:
-                          try:
-                              scores_list.append(CategoryScore(label=label, confidence=score_val))
-                          except (ValueError, TypeError) as score_err:
-                               logger.warning(f"Skipping invalid score from text pipeline: label='{label}', score={score_val}. Error: {score_err}")
+            if isinstance(result_raw, dict) and "labels" in result_raw and "scores" in result_raw:
+                for label, score_val in zip(result_raw["labels"], result_raw["scores"]):
+                    if score_val >= min_confidence:
+                        try:
+                            scores_list.append(CategoryScore(label, score_val))
+                        except (ValueError, TypeError) as score_err:
+                            logger.warning(
+                                f"Skipping invalid score from text pipeline: label='{label}', score={score_val}. Error: {score_err}"
+                            )
             # Handle vision pipeline format (list of dicts with 'label' and 'score')
-            elif isinstance(result_raw, list) and all(isinstance(item, dict) and 'label' in item and 'score' in item for item in result_raw):
-                 for item in result_raw:
-                      score_val = item['score']
-                      label = item['label']
-                      if score_val >= min_confidence:
-                           try:
-                               scores_list.append(CategoryScore(label=label, confidence=score_val))
-                           except (ValueError, TypeError) as score_err:
-                                logger.warning(f"Skipping invalid score from vision pipeline: label='{label}', score={score_val}. Error: {score_err}")
+            elif isinstance(result_raw, list) and all(
+                isinstance(item, dict) and "label" in item and "score" in item
+                for item in result_raw
+            ):
+                for item in result_raw:
+                    score_val = item["score"]
+                    label = item["label"]
+                    if score_val >= min_confidence:
+                        try:
+                            scores_list.append(CategoryScore(label, score_val))
+                        except (ValueError, TypeError) as score_err:
+                            logger.warning(
+                                f"Skipping invalid score from vision pipeline: label='{label}', score={score_val}. Error: {score_err}"
+                            )
             else:
-                 logger.warning(f"Unexpected raw result format from pipeline for model '{model_id}': {type(result_raw)}. Cannot extract scores.")
-                 # Return empty result?
-                 # scores_list = []
+                logger.warning(
+                    f"Unexpected raw result format from pipeline for model '{model_id}': {type(result_raw)}. Cannot extract scores."
+                )
+                # Return empty result?
+                # scores_list = []
-            return ClassificationResult(
+            # ClassificationResult now calculates top score/category internally
+            result_obj = ClassificationResult(
+                scores=scores_list,  # Pass the filtered list
                 model_id=model_id,
                 using=effective_using,
-                timestamp=timestamp,
                 parameters=parameters,
-                scores=scores_list
+                timestamp=timestamp,
             )
+            return result_obj
             # --- End Processing --- #
         except Exception as e:
-             logger.error(f"Classification failed for model '{model_id}': {e}", exc_info=True)
-             # Return an empty result object on failure?
-             # return ClassificationResult(model_id=model_id, engine_type=engine_type, timestamp=timestamp, parameters=parameters, scores=[])
-             raise ClassificationError(f"Classification failed using model '{model_id}'. Error: {e}") from e
+            logger.error(f"Classification failed for model '{model_id}': {e}", exc_info=True)
+            # Return an empty result object on failure?
+            # return ClassificationResult(model_id=model_id, engine_type=engine_type, timestamp=timestamp, parameters=parameters, scores=[])
+            raise ClassificationError(
+                f"Classification failed using model '{model_id}'. Error: {e}"
+            ) from e
     def classify_batch(
         self,
         item_contents: List[Union[str, Image.Image]],
-        categories: List[str],
+        labels: List[str],
         model_id: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
         multi_label: bool = False,
         batch_size: int = 8,
         progress_bar: bool = True,
-        **kwargs
-    ) -> List[ClassificationResult]: # Return list of ClassificationResult
+        **kwargs,
+    ) -> List[ClassificationResult]:  # Return list of ClassificationResult
         """Classifies a batch of items (text or image) using the pipeline's batching."""
         if not item_contents:
-             return []
+            return []
         # Determine model and engine type (assuming uniform type in batch)
         first_item = item_contents[0]
         effective_using = using
         if model_id is None:
-             if isinstance(first_item, str):
-                 effective_using = "text"
-                 model_id = self.DEFAULT_TEXT_MODEL
-             elif isinstance(first_item, Image.Image):
-                 effective_using = "vision"
-                 model_id = self.DEFAULT_VISION_MODEL
-             else:
-                 raise TypeError(f"Unsupported item_content type in batch: {type(first_item)}")
+            if isinstance(first_item, str):
+                effective_using = "text"
+                model_id = self.DEFAULT_TEXT_MODEL
+            elif isinstance(first_item, Image.Image):
+                effective_using = "vision"
+                model_id = self.DEFAULT_VISION_MODEL
+            else:
+                raise TypeError(f"Unsupported item_content type in batch: {type(first_item)}")
         else:
-             effective_using = self.infer_using(model_id, using)
-             if model_id is None:
-                  model_id = self.DEFAULT_TEXT_MODEL if effective_using == "text" else self.DEFAULT_VISION_MODEL
+            effective_using = self.infer_using(model_id, using)
+            if model_id is None:
+                model_id = (
+                    self.DEFAULT_TEXT_MODEL
+                    if effective_using == "text"
+                    else self.DEFAULT_VISION_MODEL
+                )
-        if not categories:
-             raise ValueError("Categories list cannot be empty.")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
         pipeline_instance = self._get_pipeline(model_id, effective_using)
-        timestamp = datetime.now() # Single timestamp for the batch run
-        parameters = { # Parameters for the whole batch
-            'categories': categories,
-            'model_id': model_id,
-            'using': effective_using,
-            'min_confidence': min_confidence,
-            'multi_label': multi_label,
-            'batch_size': batch_size,
-            **kwargs
+        timestamp = datetime.now()  # Single timestamp for the batch run
+        parameters = {  # Parameters for the whole batch
+            "labels": labels,
+            "model_id": model_id,
+            "using": effective_using,
+            "min_confidence": min_confidence,
+            "multi_label": multi_label,
+            "batch_size": batch_size,
+            **kwargs,
         }
-        logger.info(f"Classifying batch of {len(item_contents)} items with model '{model_id}' (batch size: {batch_size})")
+        logger.info(
+            f"Classifying batch of {len(item_contents)} items with model '{model_id}' (batch size: {batch_size})"
+        )
         batch_results_list: List[ClassificationResult] = []
         try:
             # Use pipeline directly for batching
             results_iterator = pipeline_instance(
                 item_contents,
-                candidate_labels=categories,
+                candidate_labels=labels,
                 multi_label=multi_label,
                 batch_size=batch_size,
-                **kwargs
+                **kwargs,
             )
             # Wrap with tqdm for progress if requested
             total_items = len(item_contents)
             if progress_bar:
-                 # Get the appropriate tqdm class
-                 tqdm_class = get_tqdm()
-                 results_iterator = tqdm_class(
-                      results_iterator,
-                      total=total_items,
-                      desc=f"Classifying batch ({model_id})",
-                      leave=False # Don't leave progress bar hanging
-                 )
+                # Get the appropriate tqdm class
+                tqdm_class = get_tqdm()
+                results_iterator = tqdm_class(
+                    results_iterator,
+                    total=total_items,
+                    desc=f"Classifying batch ({model_id})",
+                    leave=False,  # Don't leave progress bar hanging
+                )
             for raw_result in results_iterator:
-                # --- Process each raw result (which corresponds to ONE input item) --- #
+                # --- Process each raw result (which corresponds to ONE input item) --- #
                 scores_list: List[CategoryScore] = []
                 try:
                     # Check for text format (dict with 'labels' and 'scores')
-                    if isinstance(raw_result, dict) and 'labels' in raw_result and 'scores' in raw_result:
-                        for label, score_val in zip(raw_result['labels'], raw_result['scores']):
-                             if score_val >= min_confidence:
-                                 try:
-                                     scores_list.append(CategoryScore(label=label, confidence=score_val))
-                                 except (ValueError, TypeError) as score_err:
-                                      logger.warning(f"Skipping invalid score from text pipeline batch: label='{label}', score={score_val}. Error: {score_err}")
+                    if (
+                        isinstance(raw_result, dict)
+                        and "labels" in raw_result
+                        and "scores" in raw_result
+                    ):
+                        for label, score_val in zip(raw_result["labels"], raw_result["scores"]):
+                            if score_val >= min_confidence:
+                                try:
+                                    scores_list.append(CategoryScore(label, score_val))
+                                except (ValueError, TypeError) as score_err:
+                                    logger.warning(
+                                        f"Skipping invalid score from text pipeline batch: label='{label}', score={score_val}. Error: {score_err}"
+                                    )
                     # Check for vision format (list of dicts with 'label' and 'score')
                     elif isinstance(raw_result, list):
-                         for item in raw_result:
-                              try:
-                                  score_val = item['score']
-                                  label = item['label']
-                                  if score_val >= min_confidence:
-                                      scores_list.append(CategoryScore(label=label, confidence=score_val))
-                              except (KeyError, ValueError, TypeError) as item_err:
-                                   logger.warning(f"Skipping invalid item in vision result list from batch: {item}. Error: {item_err}")
+                        for item in raw_result:
+                            try:
+                                score_val = item["score"]
+                                label = item["label"]
+                                if score_val >= min_confidence:
+                                    scores_list.append(CategoryScore(label, score_val))
+                            except (KeyError, ValueError, TypeError) as item_err:
+                                logger.warning(
+                                    f"Skipping invalid item in vision result list from batch: {item}. Error: {item_err}"
+                                )
                     else:
-                         logger.warning(f"Unexpected raw result format in batch item from model '{model_id}': {type(raw_result)}. Cannot extract scores.")
+                        logger.warning(
+                            f"Unexpected raw result format in batch item from model '{model_id}': {type(raw_result)}. Cannot extract scores."
+                        )
                 except Exception as proc_err:
-                     logger.error(f"Error processing result item in batch: {proc_err}", exc_info=True)
-                     # scores_list remains empty for this item
+                    logger.error(
+                        f"Error processing result item in batch: {proc_err}", exc_info=True
+                    )
+                    # scores_list remains empty for this item
+                # --- Determine top category and score ---
+                scores_list.sort(key=lambda s: s.score, reverse=True)
+                top_category = scores_list[0].label
+                top_score = scores_list[0].score
+                # --- End Determine top category ---
                 # Append result object for this item
-                batch_results_list.append(ClassificationResult(
-                     model_id=model_id,
-                     using=effective_using,
-                     timestamp=timestamp, # Use same timestamp for batch
-                     parameters=parameters, # Use same params for batch
-                     scores=scores_list
-                ))
+                batch_results_list.append(
+                    ClassificationResult(
+                        scores=scores_list,  # Pass the full list, init will sort/filter
+                        model_id=model_id,
+                        using=effective_using,
+                        timestamp=timestamp,  # Use same timestamp for batch
+                        parameters=parameters,  # Use same params for batch
+                    )
+                )
                 # --- End Processing --- #
             if len(batch_results_list) != total_items:
-                 logger.warning(f"Batch classification returned {len(batch_results_list)} results, but expected {total_items}. Results might be incomplete or misaligned.")
+                logger.warning(
+                    f"Batch classification returned {len(batch_results_list)} results, but expected {total_items}. Results might be incomplete or misaligned."
+                )
             return batch_results_list
         except Exception as e:
-             logger.error(f"Batch classification failed for model '{model_id}': {e}", exc_info=True)
-             # Return list of empty results?
-             # return [ClassificationResult(model_id=model_id, s=engine_type, timestamp=timestamp, parameters=parameters, scores=[]) for _ in item_contents]
-             raise ClassificationError(f"Batch classification failed using model '{model_id}'. Error: {e}") from e
+            logger.error(f"Batch classification failed for model '{model_id}': {e}", exc_info=True)
+            # Return list of empty results?
+            # return [ClassificationResult(model_id=model_id, s=engine_type, timestamp=timestamp, parameters=parameters, scores=[]) for _ in item_contents]
+            raise ClassificationError(
+                f"Batch classification failed using model '{model_id}'. Error: {e}"
+            ) from e

natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl