natural-pdf 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- natural_pdf/classification/manager.py +26 -22
- natural_pdf/classification/mixin.py +7 -7
- natural_pdf/classification/results.py +17 -9
- natural_pdf/collections/mixins.py +17 -0
- natural_pdf/collections/pdf_collection.py +78 -46
- natural_pdf/core/pdf.py +62 -6
- natural_pdf/elements/collections.py +107 -3
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/RECORD +12 -12
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -0
natural_pdf/classification/manager.py CHANGED

```diff
@@ -161,7 +161,7 @@ class ClassificationManager:
     def classify_item(
         self,
         item_content: Union[str, Image.Image],
-
+        labels: List[str],
         model_id: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
@@ -193,13 +193,13 @@ class ClassificationManager:
             else self.DEFAULT_VISION_MODEL
         )
 
-        if not
-            raise ValueError("
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
 
         pipeline_instance = self._get_pipeline(model_id, effective_using)
         timestamp = datetime.now()
         parameters = {  # Store parameters used for this run
-            "
+            "labels": labels,
             "model_id": model_id,
             "using": effective_using,
             "min_confidence": min_confidence,
@@ -214,7 +214,7 @@ class ClassificationManager:
             # Handle potential kwargs for specific pipelines if needed
             # The zero-shot pipelines expect `candidate_labels`
             result_raw = pipeline_instance(
-                item_content, candidate_labels=
+                item_content, candidate_labels=labels, multi_label=multi_label, **kwargs
             )
             logger.debug(f"Raw pipeline result: {result_raw}")
 
@@ -226,7 +226,7 @@ class ClassificationManager:
                 for label, score_val in zip(result_raw["labels"], result_raw["scores"]):
                     if score_val >= min_confidence:
                         try:
-                            scores_list.append(CategoryScore(label
+                            scores_list.append(CategoryScore(label, score_val))
                         except (ValueError, TypeError) as score_err:
                             logger.warning(
                                 f"Skipping invalid score from text pipeline: label='{label}', score={score_val}. Error: {score_err}"
@@ -241,7 +241,7 @@ class ClassificationManager:
                     label = item["label"]
                     if score_val >= min_confidence:
                         try:
-                            scores_list.append(CategoryScore(label
+                            scores_list.append(CategoryScore(label, score_val))
                         except (ValueError, TypeError) as score_err:
                             logger.warning(
                                 f"Skipping invalid score from vision pipeline: label='{label}', score={score_val}. Error: {score_err}"
@@ -253,13 +253,15 @@ class ClassificationManager:
                 # Return empty result?
                 # scores_list = []
 
-
+            # ClassificationResult now calculates top score/category internally
+            result_obj = ClassificationResult(
+                scores=scores_list,  # Pass the filtered list
                 model_id=model_id,
                 using=effective_using,
-                timestamp=timestamp,
                 parameters=parameters,
-
+                timestamp=timestamp,
             )
+            return result_obj
             # --- End Processing --- #
 
         except Exception as e:
@@ -273,7 +275,7 @@ class ClassificationManager:
     def classify_batch(
         self,
         item_contents: List[Union[str, Image.Image]],
-
+        labels: List[str],
         model_id: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
@@ -307,13 +309,13 @@ class ClassificationManager:
             else self.DEFAULT_VISION_MODEL
         )
 
-        if not
-            raise ValueError("
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
 
         pipeline_instance = self._get_pipeline(model_id, effective_using)
         timestamp = datetime.now()  # Single timestamp for the batch run
         parameters = {  # Parameters for the whole batch
-            "
+            "labels": labels,
             "model_id": model_id,
             "using": effective_using,
             "min_confidence": min_confidence,
@@ -331,7 +333,7 @@ class ClassificationManager:
             # Use pipeline directly for batching
             results_iterator = pipeline_instance(
                 item_contents,
-                candidate_labels=
+                candidate_labels=labels,
                 multi_label=multi_label,
                 batch_size=batch_size,
                 **kwargs,
@@ -362,9 +364,7 @@ class ClassificationManager:
                     for label, score_val in zip(raw_result["labels"], raw_result["scores"]):
                         if score_val >= min_confidence:
                             try:
-                                scores_list.append(
-                                    CategoryScore(label=label, confidence=score_val)
-                                )
+                                scores_list.append(CategoryScore(label, score_val))
                             except (ValueError, TypeError) as score_err:
                                 logger.warning(
                                     f"Skipping invalid score from text pipeline batch: label='{label}', score={score_val}. Error: {score_err}"
@@ -376,9 +376,7 @@ class ClassificationManager:
                             score_val = item["score"]
                             label = item["label"]
                             if score_val >= min_confidence:
-                                scores_list.append(
-                                    CategoryScore(label=label, confidence=score_val)
-                                )
+                                scores_list.append(CategoryScore(label, score_val))
                         except (KeyError, ValueError, TypeError) as item_err:
                             logger.warning(
                                 f"Skipping invalid item in vision result list from batch: {item}. Error: {item_err}"
@@ -394,14 +392,20 @@ class ClassificationManager:
                     )
                     # scores_list remains empty for this item
 
+            # --- Determine top category and score ---
+            scores_list.sort(key=lambda s: s.score, reverse=True)
+            top_category = scores_list[0].label
+            top_score = scores_list[0].score
+            # --- End Determine top category ---
+
             # Append result object for this item
             batch_results_list.append(
                 ClassificationResult(
+                    scores=scores_list,  # Pass the full list, init will sort/filter
                     model_id=model_id,
                     using=effective_using,
                     timestamp=timestamp,  # Use same timestamp for batch
                    parameters=parameters,  # Use same params for batch
-                    scores=scores_list,
                 )
             )
             # --- End Processing --- #
```
natural_pdf/classification/mixin.py CHANGED

```diff
@@ -44,9 +44,9 @@ class ClassificationMixin:
 
     def classify(
         self,
-
-        model: Optional[str] = None,
-        using: Optional[str] = None,
+        labels: List[str],
+        model: Optional[str] = None,
+        using: Optional[str] = None,
         min_confidence: float = 0.0,
         analysis_key: str = "classification",  # Default key
         multi_label: bool = False,
@@ -60,7 +60,7 @@ class ClassificationMixin:
         result under that key.
 
         Args:
-
+            labels: A list of string category names.
             model: Model identifier (e.g., 'text', 'vision', HF ID). Defaults handled by manager.
             using: Optional processing mode ('text' or 'vision'). If None, inferred by manager.
             min_confidence: Minimum confidence threshold for results (0.0-1.0).
@@ -103,9 +103,9 @@ class ClassificationMixin:
         # Manager now returns a ClassificationResult object
         result_obj: ClassificationResult = manager.classify_item(
             item_content=content,
-
-            model_id=effective_model_id,
-            using=inferred_using,
+            labels=labels,
+            model_id=effective_model_id,
+            using=inferred_using,
             min_confidence=min_confidence,
             multi_label=multi_label,
             **kwargs,
```
natural_pdf/classification/results.py CHANGED

```diff
@@ -11,19 +11,19 @@ logger = logging.getLogger(__name__)
 class CategoryScore:
     """Represents a category and its confidence score from classification."""
 
-
+    label: str
     score: float
 
     def to_dict(self) -> Dict[str, Any]:
         """Convert to dictionary for serialization."""
-        return {"category": self.
+        return {"category": self.label, "score": self.score}
 
 
 @dataclass
 class ClassificationResult:
     """Results from a classification operation."""
 
-    category: str
+    category: Optional[str]  # Can be None if scores are empty
     score: float
     scores: List[CategoryScore]
     model_id: str
@@ -33,17 +33,25 @@ class ClassificationResult:
 
     def __init__(
         self,
-
-        score: float,
-        scores: List[CategoryScore],
+        scores: List[CategoryScore],  # Now the primary source
         model_id: str,
         using: str,
         parameters: Optional[Dict[str, Any]] = None,
         timestamp: Optional[datetime] = None,
     ):
-
-
-
+        # Determine top category and score from the scores list
+        if scores:
+            # Sort scores descending by score to find the top one
+            sorted_scores = sorted(scores, key=lambda s: s.score, reverse=True)
+            self.category = sorted_scores[0].label
+            self.score = sorted_scores[0].score
+            self.scores = sorted_scores  # Store the sorted list
+        else:
+            # Handle empty scores list
+            self.category = None
+            self.score = 0.0
+            self.scores = []  # Store empty list
+
         self.model_id = model_id
         self.using = using
         self.parameters = parameters or {}
```
natural_pdf/collections/mixins.py CHANGED

```diff
@@ -109,3 +109,20 @@ class ApplyMixin:
             return PageCollection(results)
 
         return results
+
+    def filter(self: Any, predicate: Callable[[Any], bool]) -> Any:
+        """
+        Filters the collection based on a predicate function.
+
+        Args:
+            predicate: A function that takes an item and returns True if the item
+                       should be included in the result, False otherwise.
+
+        Returns:
+            A new collection of the same type containing only the items
+            for which the predicate returned True.
+        """
+        items_iterable = self._get_items_for_apply()
+        filtered_items = [item for item in items_iterable if predicate(item)]
+
+        return type(self)(filtered_items)
```
natural_pdf/collections/pdf_collection.py CHANGED

```diff
@@ -519,7 +519,7 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixin
 
         return self
 
-    def categorize(self,
+    def categorize(self, labels: List[str], **kwargs):
         """Categorizes PDFs in the collection based on content or features."""
         # Implementation requires integrating with classification models or logic
         raise NotImplementedError("categorize requires classification implementation.")
@@ -570,85 +570,101 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixin
     # --- Classification Method --- #
     def classify_all(
         self,
-
-
+        labels: List[str],
+        using: Optional[str] = None,  # Default handled by PDF.classify -> manager
+        model: Optional[str] = None,  # Optional model ID
         max_workers: Optional[int] = None,
+        analysis_key: str = "classification",  # Key for storing result in PDF.analyses
         **kwargs,
     ) -> "PDFCollection":
         """
-        Classify
+        Classify each PDF document in the collection, potentially in parallel.
 
-        This method
-
-
+        This method delegates classification to each PDF object's `classify` method.
+        By default, uses the full extracted text of the PDF.
+        If `using='vision'`, it classifies the first page's image, but ONLY if
+        the PDF has a single page (raises ValueError otherwise).
 
         Args:
-
-
+            labels: A list of string category names.
+            using: Processing mode ('text', 'vision'). If None, manager infers (defaulting to text).
+            model: Optional specific model identifier (e.g., HF ID). If None, manager uses default for 'using' mode.
             max_workers: Maximum number of threads to process PDFs concurrently.
                 If None or 1, processing is sequential.
-
-
-
+            analysis_key: Key under which to store the ClassificationResult in each PDF's `analyses` dict.
+            **kwargs: Additional arguments passed down to `pdf.classify` (e.g., device,
+                      min_confidence, multi_label, text extraction options).
 
         Returns:
             Self for method chaining.
 
         Raises:
-            ValueError: If
-            ClassificationError: If classification fails for any
+            ValueError: If labels list is empty, or if using='vision' on a multi-page PDF.
+            ClassificationError: If classification fails for any PDF (will stop processing).
             ImportError: If classification dependencies are missing.
         """
         PDF = self._get_pdf_class()
-        if not
-            raise ValueError("
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
 
+        if not self._pdfs:
+            logger.warning("PDFCollection is empty, skipping classification.")
+            return self
+
+        mode_desc = f"using='{using}'" if using else f"model='{model}'" if model else "default text"
         logger.info(
-            f"Starting classification for {len(self._pdfs)} PDFs in collection (
+            f"Starting classification for {len(self._pdfs)} PDFs in collection ({mode_desc})..."
         )
 
-        # Calculate total pages for the progress bar
-        total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
-        if total_pages == 0:
-            logger.warning("No pages found in the PDF collection to classify.")
-            return self
-
         progress_bar = tqdm(
-            total=
+            total=len(self._pdfs), desc=f"Classifying PDFs ({mode_desc})", unit="pdf"
         )
 
         # Worker function
         def _process_pdf_classification(pdf: PDF):
            thread_id = threading.current_thread().name
            pdf_path = pdf.path
-            logger.debug(f"[{thread_id}] Starting classification process for: {pdf_path}")
+            logger.debug(f"[{thread_id}] Starting classification process for PDF: {pdf_path}")
            start_time = time.monotonic()
            try:
-                # Call
-                pdf.
-
+                # Call classify directly on the PDF object
+                pdf.classify(
+                    labels=labels,
+                    using=using,
                     model=model,
-
-                    **kwargs,
+                    analysis_key=analysis_key,
+                    **kwargs,  # Pass other relevant args like min_confidence, multi_label
                 )
                 end_time = time.monotonic()
                 logger.debug(
-                    f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
+                    f"[{thread_id}] Finished classification for PDF: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
                 )
+                progress_bar.update(1)  # Update progress bar upon success
                 return pdf_path, None  # Return path and no error
-            except
+            except ValueError as ve:
+                # Catch specific error for vision on multi-page PDF
                 end_time = time.monotonic()
-                # Error is logged within classify_pages, but log summary here
                 logger.error(
-                    f"[{thread_id}]
+                    f"[{thread_id}] Skipped classification for {pdf_path} after {end_time - start_time:.2f}s: {ve}",
                     exc_info=False,
                 )
-                #
-
+                progress_bar.update(1)  # Still update progress bar
+                return pdf_path, ve  # Return the specific ValueError
+            except Exception as e:
+                end_time = time.monotonic()
+                logger.error(
+                    f"[{thread_id}] Failed classification process for PDF {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                    exc_info=True,  # Log full traceback for unexpected errors
+                )
+                # Close progress bar immediately on critical error to avoid hanging
+                if not progress_bar.disable:
+                    progress_bar.close()
                 # Re-raise the exception to stop the entire collection processing
-                raise
+                raise ClassificationError(f"Classification failed for {pdf_path}: {e}") from e
 
         # Use ThreadPoolExecutor for parallel processing if max_workers > 1
+        processed_count = 0
+        skipped_count = 0
         try:
             if max_workers is not None and max_workers > 1:
                 logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
@@ -659,23 +675,39 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixin
                     for pdf in self._pdfs:
                         futures.append(executor.submit(_process_pdf_classification, pdf))
 
-                    # Wait for all futures to complete
-                    #
+                    # Wait for all futures to complete
+                    # Progress updated within worker
                     for future in concurrent.futures.as_completed(futures):
-
+                        processed_count += 1
+                        pdf_path, error = (
+                            future.result()
+                        )  # Raise ClassificationError if worker failed critically
+                        if isinstance(error, ValueError):
+                            # Logged in worker, just count as skipped
+                            skipped_count += 1
 
             else:  # Sequential processing
                 logger.info("Classifying PDFs sequentially.")
                 for pdf in self._pdfs:
-
-
-
+                    processed_count += 1
+                    pdf_path, error = _process_pdf_classification(
+                        pdf
+                    )  # Raise ClassificationError if worker failed critically
+                    if isinstance(error, ValueError):
+                        skipped_count += 1
+
+            final_message = (
+                f"Finished classification across the collection. Processed: {processed_count}"
+            )
+            if skipped_count > 0:
+                final_message += f", Skipped (e.g., vision on multi-page): {skipped_count}"
+            logger.info(final_message + ".")
 
         finally:
-            # Ensure progress bar is closed
+            # Ensure progress bar is closed properly
             if not progress_bar.disable and progress_bar.n < progress_bar.total:
-                progress_bar.
-
+                progress_bar.n = progress_bar.total  # Ensure it reaches 100%
+            if not progress_bar.disable:
                 progress_bar.close()
 
         return self
```
natural_pdf/core/pdf.py CHANGED

```diff
@@ -84,7 +84,7 @@ except ImportError:
 # End Deskew Imports
 
 
-class PDF(ExtractionMixin, ExportMixin):
+class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     """
     Enhanced PDF wrapper built on top of pdfplumber.
 
@@ -194,6 +194,7 @@ class PDF(ExtractionMixin, ExportMixin):
 
         self._initialize_managers()
         self._initialize_highlighter()
+        self.analyses: Dict[str, Any] = {}
 
     def _initialize_managers(self):
         """Initialize manager instances based on DEFAULT_MANAGERS."""
@@ -1243,7 +1244,7 @@ class PDF(ExtractionMixin, ExportMixin):
 
     def classify_pages(
         self,
-
+        labels: List[str],
         model: Optional[str] = None,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
         analysis_key: str = "classification",
@@ -1254,7 +1255,7 @@ class PDF(ExtractionMixin, ExportMixin):
         Classifies specified pages of the PDF.
 
         Args:
-
+            labels: List of category names
             model: Model identifier ('text', 'vision', or specific HF ID)
             pages: Page indices, slice, or None for all pages
             analysis_key: Key to store results in page's analyses dict
@@ -1264,8 +1265,8 @@ class PDF(ExtractionMixin, ExportMixin):
         Returns:
             Self for method chaining
         """
-        if not
-            raise ValueError("
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
 
         try:
             manager = self.get_manager("classification")
@@ -1332,7 +1333,7 @@ class PDF(ExtractionMixin, ExportMixin):
         try:
             batch_results = manager.classify_batch(
                 item_contents=page_contents,
-
+                labels=labels,
                 model_id=model,
                 using=inferred_using,
                 **kwargs,
@@ -1537,3 +1538,58 @@ class PDF(ExtractionMixin, ExportMixin):
                 raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         else:
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+
+    # --- Classification Mixin Implementation --- #
+
+    def _get_classification_manager(self) -> "ClassificationManager":
+        """Returns the ClassificationManager instance for this PDF."""
+        try:
+            return self.get_manager("classification")
+        except (KeyError, RuntimeError) as e:
+            raise AttributeError(f"Could not retrieve ClassificationManager: {e}") from e
+
+    def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, Image.Image]:
+        """
+        Provides the content for classifying the entire PDF.
+
+        Args:
+            model_type: 'text' or 'vision'.
+            **kwargs: Additional arguments (e.g., for text extraction or image rendering).
+
+        Returns:
+            Extracted text (str) or the first page's image (PIL.Image).
+
+        Raises:
+            ValueError: If model_type is 'vision' and PDF has != 1 page,
+                        or if model_type is unsupported, or if content cannot be generated.
+        """
+        if model_type == "text":
+            try:
+                # Extract text from the whole document
+                text = self.extract_text(**kwargs)  # Pass relevant kwargs
+                if not text or text.isspace():
+                    raise ValueError("PDF contains no extractable text for classification.")
+                return text
+            except Exception as e:
+                logger.error(f"Error extracting text for PDF classification: {e}")
+                raise ValueError("Failed to extract text for classification.") from e
+
+        elif model_type == "vision":
+            if len(self.pages) == 1:
+                # Use the single page's content method
+                try:
+                    return self.pages[0]._get_classification_content(model_type="vision", **kwargs)
+                except Exception as e:
+                    logger.error(f"Error getting image from single page for classification: {e}")
+                    raise ValueError("Failed to get image from single page.") from e
+            elif len(self.pages) == 0:
+                raise ValueError("Cannot classify empty PDF using vision model.")
+            else:
+                raise ValueError(
+                    f"Vision classification for a PDF object is only supported for single-page PDFs. "
+                    f"This PDF has {len(self.pages)} pages. Use pdf.pages[0].classify() or pdf.classify_pages()."
+                )
+        else:
+            raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
+
+    # --- End Classification Mixin Implementation ---
```
natural_pdf/elements/collections.py CHANGED

```diff
@@ -20,6 +20,7 @@ from typing import (
 )
 
 from pdfplumber.utils.geometry import objects_to_bbox
+from PIL import Image, ImageDraw, ImageFont
 
 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -1239,7 +1240,7 @@ class ElementCollection(
     # --- Classification Method --- #
     def classify_all(
         self,
-
+        labels: List[str],
         model: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
@@ -1253,7 +1254,7 @@ class ElementCollection(
         """Classifies all elements in the collection in batch.
 
         Args:
-
+            labels: List of category labels.
             model: Model ID (or alias 'text', 'vision').
             using: Optional processing mode ('text' or 'vision'). Inferred if None.
             min_confidence: Minimum confidence threshold.
@@ -1326,7 +1327,7 @@ class ElementCollection(
         # Call manager's batch classify
         batch_results: List[ClassificationResult] = manager.classify_batch(
             item_contents=items_to_classify,
-
+            labels=labels,
             model_id=model,
             using=inferred_using,
             min_confidence=min_confidence,
@@ -2263,3 +2264,106 @@ class PageCollection(Generic[P], ApplyMixin):
         )
 
     # --- End Deskew Method --- #
+
+    def to_image(
+        self,
+        page_width: int = 300,
+        cols: Optional[int] = 4,
+        rows: Optional[int] = None,
+        max_pages: Optional[int] = None,
+        spacing: int = 10,
+        add_labels: bool = True,
+        show_category: bool = False,  # Add new flag
+    ) -> Optional["Image.Image"]:
+        """
+        Generate a grid of page images for this collection.
+
+        Args:
+            page_width: Width in pixels for rendering individual pages
+            cols: Number of columns in grid (default: 4)
+            rows: Number of rows in grid (calculated automatically if None)
+            max_pages: Maximum number of pages to include (default: all)
+            spacing: Spacing between page thumbnails in pixels
+            add_labels: Whether to add page number labels
+            show_category: Whether to add category and confidence labels (if available)
+
+        Returns:
+            PIL Image of the page grid or None if no pages
+        """
+        if not self.pages:
+            logger.warning("Cannot generate image for empty PageCollection")
+            return None
+
+        # Limit pages if max_pages is specified
+        pages_to_render = self.pages[:max_pages] if max_pages else self.pages
+
+        # Load font once outside the loop
+        font = ImageFont.load_default(16) if add_labels else None
+
+        # Render individual page images
+        page_images = []
+        for page in pages_to_render:
+            img = page.to_image(width=page_width)
+
+            # Add page number label
+            if add_labels and font:  # Check if font was loaded
+                draw = ImageDraw.Draw(img)
+                pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf else ""
+                label_text = f"p{page.number} - {pdf_name}"
+
+                # Add category if requested and available
+                if show_category:
+                    category = getattr(page, "category", None)
+                    confidence = getattr(page, "category_confidence", None)
+                    if category is not None and confidence is not None:
+                        category_str = f"{category} {confidence:.3f}"
+                        label_text += f"\n{category_str}"
+
+                # Calculate bounding box for multi-line text
+                # Use (5, 5) as top-left anchor for textbbox calculation for padding
+                # Use multiline_textbbox for accurate bounds with newlines
+                bbox = draw.multiline_textbbox((5, 5), label_text, font=font)
+                # Add padding to the calculated bbox for the white background
+                bg_rect = (bbox[0] - 2, bbox[1] - 2, bbox[2] + 2, bbox[3] + 2)
+
+                # Draw white background rectangle
+                draw.rectangle(bg_rect, fill=(255, 255, 255))
+
+                # Draw the potentially multi-line text using multiline_text
+                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font)
+
+            page_images.append(img)
+
+        # Calculate grid dimensions if not provided
+        if not rows and not cols:
+            # Default to a square-ish grid
+            cols = min(4, int(len(page_images) ** 0.5) + 1)
+            rows = (len(page_images) + cols - 1) // cols
+        elif rows and not cols:
+            cols = (len(page_images) + rows - 1) // rows
+        elif cols and not rows:
+            rows = (len(page_images) + cols - 1) // cols
+
+        # Get maximum dimensions for consistent grid cells
+        max_width = max(img.width for img in page_images)
+        max_height = max(img.height for img in page_images)
+
+        # Create grid image
+        grid_width = cols * max_width + (cols + 1) * spacing
+        grid_height = rows * max_height + (rows + 1) * spacing
+        grid_img = Image.new("RGB", (grid_width, grid_height), (255, 255, 255))
+
+        # Place images in grid
+        for i, img in enumerate(page_images):
+            if i >= rows * cols:
+                break
+
+            row = i // cols
+            col = i % cols
+
+            x = col * max_width + (col + 1) * spacing
+            y = row * max_height + (row + 1) * spacing
+
+            grid_img.paste(img, (x, y))
+
+        return grid_img
```
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/RECORD CHANGED

```diff
@@ -15,19 +15,19 @@ natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh
 natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuKvUGQfEBI,9789
 natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
 natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
-natural_pdf/classification/manager.py,sha256=
-natural_pdf/classification/mixin.py,sha256=
-natural_pdf/classification/results.py,sha256=
-natural_pdf/collections/mixins.py,sha256=
-natural_pdf/collections/pdf_collection.py,sha256=
+natural_pdf/classification/manager.py,sha256=RxJch8xVu8Me6_T2Kh7ZqUNaAKlXvfyCZD0hRc4Hk6w,17929
+natural_pdf/classification/mixin.py,sha256=hhX9qWPShpOq_-mgoEq0GUWnutBnNMo3YdUlxwyNWMA,6781
+natural_pdf/classification/results.py,sha256=El1dY7cBQVOB5lP-uj52dWgH6Y7TeQgJOVcZD-OLjes,2778
+natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
+natural_pdf/collections/pdf_collection.py,sha256=obHizc2KR4ZiAspodaPOeMgfpoW3aKg_G0goBHlrFJI,32018
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
 natural_pdf/core/element_manager.py,sha256=knRN6qXxV-6KZCj2GUOyiqRi83DjJzL77TmKGeiD08Y,25144
 natural_pdf/core/highlighting_service.py,sha256=wINdRxq63_CYYA81EwuCRqhNKimn0dNKyoKWuzkirc0,31959
 natural_pdf/core/page.py,sha256=icJLu6jRbkD3iOE8r60XPkQZ8FN3ZcKo5TT5MVGkGl0,105122
-natural_pdf/core/pdf.py,sha256=
+natural_pdf/core/pdf.py,sha256=gOvLumJZaHXdDwpxbX9HcC_Rea4HaYMemBdYg5GX7gQ,63837
 natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
 natural_pdf/elements/base.py,sha256=7vVCPQyEHifh4LyBuv0kLTqr_gNbbEMc4SoiJmLfEUQ,37585
-natural_pdf/elements/collections.py,sha256=
+natural_pdf/elements/collections.py,sha256=AN0WrrQYfCmcRS0-PHP4RQHxxdpcWnDuH2cWnmqtDE0,97184
 natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
 natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
 natural_pdf/elements/region.py,sha256=LfyB_9DCw5Tzn_G9xsjFz2FfKBOHRqGIND4DQWoA7KM,97324
@@ -73,8 +73,8 @@ natural_pdf/utils/tqdm_utils.py,sha256=wV3RXvqog26eWEFEqjt2LkGnLswmO1GXaVGSqgS7t
 natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
 natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
 natural_pdf/widgets/viewer.py,sha256=dC_hlPlosc08gsDc3bdAa8chOKtAoH9QFU6mrGOG9vE,39532
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
+natural_pdf-0.1.10.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.10.dist-info/METADATA,sha256=gjUsfmnbqrdiHcaH6L1qiw6VX4MBlWjVj5HqlDnhuQY,7401
+natural_pdf-0.1.10.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
+natural_pdf-0.1.10.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
+natural_pdf-0.1.10.dist-info/RECORD,,
```
The WHEEL, LICENSE, and top_level.txt files are unchanged between the two versions.