natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- natural_pdf/__init__.py +6 -7
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +236 -383
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +172 -83
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +318 -243
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +4 -4
- natural_pdf/flows/flow.py +1200 -243
- natural_pdf/flows/region.py +707 -261
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +7 -3
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -53
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -16,6 +16,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Type,
@@ -31,6 +32,7 @@ from natural_pdf.classification.manager import ClassificationError
 from natural_pdf.classification.mixin import ClassificationMixin
 from natural_pdf.classification.results import ClassificationResult
 from natural_pdf.core.highlighting_service import HighlightingService
+from natural_pdf.core.render_spec import RenderSpec, Visualizable
 from natural_pdf.elements.base import Element
 from natural_pdf.elements.region import Region
 from natural_pdf.export.mixin import ExportMixin
@@ -38,11 +40,11 @@ from natural_pdf.extraction.manager import StructuredDataManager
 from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
-from natural_pdf.utils.locks import pdf_render_lock
 from natural_pdf.text_mixin import TextMixin
+from natural_pdf.utils.locks import pdf_render_lock
 
 if TYPE_CHECKING:
-    from natural_pdf.elements.collections import ElementCollection
+    from natural_pdf.elements.element_collection import ElementCollection
 
 try:
     from typing import Any as TypingAny
@@ -107,7 +109,6 @@ except ImportError:
 from collections.abc import Sequence
 
 
-
 class _LazyPageList(Sequence):
     """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
 
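Migration note: 0.2.0 moves several modules, as the import hunks above (and the rename list at the top) show. Top-level imports from natural_pdf are likely unaffected, but code that reaches into these modules directly needs the new paths. A sketch:

    # natural-pdf 0.1.x (old paths, per the rename list above):
    #   from natural_pdf.elements.collections import ElementCollection, PageCollection
    #   from natural_pdf.collections.pdf_collection import PDFCollection

    # natural-pdf 0.2.0:
    from natural_pdf.elements.element_collection import ElementCollection
    from natural_pdf.core.page_collection import PageCollection
    from natural_pdf.core.pdf_collection import PDFCollection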
@@ -145,18 +146,18 @@ class _LazyPageList(Sequence):
     """
 
     def __init__(
-        self,
-        parent_pdf: "PDF",
-        plumber_pdf: "pdfplumber.PDF",
-        font_attrs=None,
+        self,
+        parent_pdf: "PDF",
+        plumber_pdf: "pdfplumber.PDF",
+        font_attrs=None,
         load_text=True,
-        indices: Optional[List[int]] = None
+        indices: Optional[List[int]] = None,
     ):
         self._parent_pdf = parent_pdf
         self._plumber_pdf = plumber_pdf
         self._font_attrs = font_attrs
         self._load_text = load_text
-
+
         # If indices is provided, this is a sliced view
         if indices is not None:
             self._indices = indices
@@ -184,23 +185,23 @@ class _LazyPageList(Sequence):
                     font_attrs=self._font_attrs,
                     load_text=self._load_text,
                 )
-
+
                 # Apply any stored exclusions to the newly created page
-                if hasattr(self._parent_pdf, '_exclusions'):
+                if hasattr(self._parent_pdf, "_exclusions"):
                     for exclusion_data in self._parent_pdf._exclusions:
                         exclusion_func, label = exclusion_data
                         try:
                             cached.add_exclusion(exclusion_func, label=label)
                         except Exception as e:
                             logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
-
+
                 # Apply any stored regions to the newly created page
-                if hasattr(self._parent_pdf, '_regions'):
+                if hasattr(self._parent_pdf, "_regions"):
                     for region_data in self._parent_pdf._regions:
                         region_func, name = region_data
                         try:
                             region_instance = region_func(cached)
-                            if region_instance and hasattr(region_instance, '__class__'):
+                            if region_instance and hasattr(region_instance, "__class__"):
                                 # Check if it's a Region-like object (avoid importing Region here)
                                 cached.add_region(region_instance, name=name, source="named")
                             elif region_instance is not None:
@@ -209,7 +210,7 @@ class _LazyPageList(Sequence):
                 )
             except Exception as e:
                 logger.warning(f"Failed to apply region to page {cached.number}: {e}")
-
+
             self._cache[index] = cached
             return cached
 
@@ -219,7 +220,7 @@ class _LazyPageList(Sequence):
 
     def __getitem__(self, key):
         if isinstance(key, slice):
-            # Get the slice of our current indices
+            # Get the slice of our current indices
             slice_indices = range(*key.indices(len(self)))
             # Extract the actual page indices for this slice
             actual_indices = [self._indices[i] for i in slice_indices]
@@ -229,7 +230,7 @@ class _LazyPageList(Sequence):
                 self._plumber_pdf,
                 font_attrs=self._font_attrs,
                 load_text=self._load_text,
-                indices=actual_indices
+                indices=actual_indices,
             )
         elif isinstance(key, int):
             if key < 0:
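The __getitem__ hunks above preserve _LazyPageList's contract: slicing yields another lazy view over the selected indices, so pages are only parsed when accessed. Illustrative usage (file name made up):

    import natural_pdf as npdf

    pdf = npdf.PDF("document.pdf")

    # Slicing returns another lazy view; no Page objects are built yet.
    first_ten = pdf.pages[0:10]

    # A Page is instantiated (and cached) only when it is actually accessed.
    page = first_ten[0]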
@@ -251,7 +252,7 @@ class _LazyPageList(Sequence):
 # --- End Lazy Page List Helper --- #
 
 
-class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
+class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, Visualizable):
     """Enhanced PDF wrapper built on top of pdfplumber.
 
     This class provides a fluent interface for working with PDF documents,
@@ -580,7 +581,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             print(f"Page {page.index} has {len(page.chars)} characters")
             ```
         """
-        from natural_pdf.elements.collections import PageCollection
+        from natural_pdf.core.page_collection import PageCollection
 
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
@@ -612,7 +613,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             raise AttributeError("PDF pages not yet initialized.")
 
         self._exclusions = []
-
+
         # Clear exclusions only from already-created (cached) pages to avoid forcing page creation
         for i in range(len(self._pages)):
             if self._pages._cache[i] is not None:  # Only clear from existing pages
@@ -622,9 +623,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
                     logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
         return self
 
-    def add_exclusion(
-        self, exclusion_func, label: str = None
-    ) -> "PDF":
+    def add_exclusion(self, exclusion_func, label: str = None) -> "PDF":
         """Add an exclusion function to the PDF.
 
         Exclusion functions define regions of each page that should be ignored during
@@ -673,12 +672,12 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
         # Support selector strings and ElementCollection objects directly.
         # Store exclusion and apply only to already-created pages.
         # ------------------------------------------------------------------
-        from natural_pdf.elements.collections import ElementCollection  # local import
+        from natural_pdf.elements.element_collection import ElementCollection  # local import
 
         if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
             # Store for bookkeeping and lazy application
             self._exclusions.append((exclusion_func, label))
-
+
             # Apply only to already-created (cached) pages to avoid forcing page creation
             for i in range(len(self._pages)):
                 if self._pages._cache[i] is not None:  # Only apply to existing pages
@@ -846,11 +845,11 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
                     "include_highlights": False,
                     "exclusions": "mask" if apply_exclusions else None,
                 }
-                …
+                # Use render() for clean image without highlights
+                img = page.render(resolution=final_resolution)
                 if img is None:
                     logger.error(f"  Failed to render page {page.number} to image.")
                     continue
-                    continue
                 images_pil.append(img)
                 page_image_map.append((page, img))
             except Exception as e:
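This hunk is one instance of a pattern repeated throughout 0.2.0: page.to_image(..., include_highlights=False) becomes page.render(), which is always highlight-free. A before/after sketch (resolution value illustrative):

    # 0.1.x: a clean page image required opting out of highlights.
    # img = page.to_image(resolution=150, include_highlights=False)

    # 0.2.0: render() is always clean; per the mode docstring later in
    # this diff, "show" is the highlight-aware path.
    img = page.render(resolution=150)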
@@ -1144,7 +1143,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
         if page_elements:
             all_elements.extend(page_elements.elements)
 
-        from natural_pdf.elements.collections import ElementCollection
+        from natural_pdf.elements.element_collection import ElementCollection
 
         return ElementCollection(all_elements)
 
@@ -1238,7 +1237,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
         start_elements=None,
         end_elements=None,
         new_section_on_page_break=False,
-        boundary_inclusion="both",
+        include_boundaries="both",
     ) -> "ElementCollection":
         """
         Extract sections from the entire PDF based on start/end elements.
@@ -1250,7 +1249,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             start_elements: Elements or selector string that mark the start of sections (optional)
             end_elements: Elements or selector string that mark the end of sections (optional)
             new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
-            boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
+            include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
 
         Returns:
             ElementCollection of Region objects representing the extracted sections
@@ -1259,13 +1258,13 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             Extract sections between headers:
             ```python
             pdf = npdf.PDF("document.pdf")
-
+
             # Get sections between headers
             sections = pdf.get_sections(
                 start_elements='text[size>14]:bold',
                 end_elements='text[size>14]:bold'
             )
-
+
             # Get sections that break at page boundaries
             sections = pdf.get_sections(
                 start_elements='text:contains("Chapter")',
@@ -1286,7 +1285,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             start_elements=start_elements,
             end_elements=end_elements,
             new_section_on_page_break=new_section_on_page_break,
-            boundary_inclusion=boundary_inclusion,
+            include_boundaries=include_boundaries,
         )
 
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
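The get_sections hunks rename a keyword argument. The removed name is truncated in this diff view; judging from the 0.1.x API it appears to have been boundary_inclusion. Callers would update along these lines (selector is illustrative):

    sections = pdf.get_sections(
        start_elements='text[size>14]:bold',
        end_elements='text[size>14]:bold',
        include_boundaries='start',  # 0.1.x spelled this boundary_inclusion=
    )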
@@ -1423,6 +1422,36 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             # Re-raise exception from exporter
             raise e
 
+    def _get_render_specs(
+        self,
+        mode: Literal["show", "render"] = "show",
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        crop: Union[bool, Literal["content"]] = False,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
+        **kwargs,
+    ) -> List[RenderSpec]:
+        """Get render specifications for this PDF.
+
+        For PDF objects, this delegates to the pages collection to handle
+        multi-page rendering.
+
+        Args:
+            mode: Rendering mode - 'show' includes highlights, 'render' is clean
+            color: Color for highlighting pages in show mode
+            highlights: Additional highlight groups to show
+            crop: Whether to crop pages
+            crop_bbox: Explicit crop bounds
+
+            **kwargs: Additional parameters
+
+        Returns:
+            List of RenderSpec objects, one per page
+        """
+        # Delegate to pages collection
+        return self.pages._get_render_specs(
+            mode=mode, color=color, highlights=highlights, crop=crop, crop_bbox=crop_bbox, **kwargs
+        )
+
     def ask(
         self,
         question: str,
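_get_render_specs is the hook that the new Visualizable base class (see the class-statement hunk above) presumably calls to drive show() and render(); the wrapper methods themselves are not visible in this diff. Assuming that design, document-level rendering would look like:

    import natural_pdf as npdf

    pdf = npdf.PDF("document.pdf")

    # Clean, highlight-free rendering (the exact return shape depends on the
    # Visualizable implementation in core/render_spec.py, not shown here).
    images = pdf.render(resolution=150)

    # Highlight-aware preview; both calls funnel into
    # pdf.pages._get_render_specs() via the delegation above.
    pdf.show()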
@@ -1447,14 +1476,20 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             Dict containing: answer, confidence, found, page_num, source_elements, etc.
         """
         # Delegate to ask_batch and return the first result
-        results = self.ask_batch(
-            …
+        results = self.ask_batch(
+            [question], mode=mode, pages=pages, min_confidence=min_confidence, model=model, **kwargs
+        )
+        return (
+            results[0]
+            if results
+            else {
+                "answer": None,
+                "confidence": 0.0,
+                "found": False,
+                "page_num": None,
+                "source_elements": [],
+            }
+        )
 
     def ask_batch(
         self,
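ask() is now a thin wrapper over ask_batch() with a well-defined empty result. Typical use, per the docstring above (question text and threshold are illustrative):

    result = pdf.ask("What is the invoice total?", min_confidence=0.8)
    if result["found"]:
        print(result["answer"], "on page", result["page_num"])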
@@ -1524,7 +1559,9 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             for _ in questions
         ]
 
-        logger.info(f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA...")
+        logger.info(
+            f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA..."
+        )
 
         # Collect all page images and metadata for batch processing
         page_images = []
@@ -1534,26 +1571,26 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
         for page in target_pages:
             # Get page image
             try:
-                …
+                # Use render() for clean image without highlights
+                page_image = page.render(resolution=150)
                 if page_image is None:
                     logger.warning(f"Failed to render image for page {page.number}, skipping")
                     continue
-
+
                 # Get text elements for word boxes
                 elements = page.find_all("text")
                 if not elements:
                     logger.warning(f"No text elements found on page {page.number}")
                     word_boxes = []
                 else:
-                    word_boxes = qa_engine._get_word_boxes_from_elements(
-                        elements, offset_x=0, offset_y=0)
+                    word_boxes = qa_engine._get_word_boxes_from_elements(
+                        elements, offset_x=0, offset_y=0
+                    )
 
                 page_images.append(page_image)
                 page_word_boxes.append(word_boxes)
-                page_metadata.append({
-                    "page_number": page.number,
-                    "page_object": page
-                })
-
+                page_metadata.append({"page_number": page.number, "page_object": page})
+
             except Exception as e:
                 logger.warning(f"Error processing page {page.number}: {e}")
                 continue
@@ -1573,22 +1610,24 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
 
         # Process all questions against all pages in batch
         all_results = []
-
+
         for question_text in questions:
             question_results = []
-
+
             # Ask this question against each page (but in batch per page)
-            for i, (page_image, word_boxes, page_meta) in enumerate(zip(page_images, page_word_boxes, page_metadata)):
+            for i, (page_image, word_boxes, page_meta) in enumerate(
+                zip(page_images, page_word_boxes, page_metadata)
+            ):
                 try:
-                    # Use the DocumentQA batch interface
+                    # Use the DocumentQA batch interface
                     page_result = qa_engine.ask(
                         image=page_image,
                         question=question_text,
                         word_boxes=word_boxes,
                         min_confidence=min_confidence,
-                        **kwargs
+                        **kwargs,
                     )
-
+
                     if page_result and page_result.found:
                         # Add page metadata to result
                         page_result_dict = {
@@ -1596,30 +1635,34 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
                             "confidence": page_result.confidence,
                             "found": page_result.found,
                             "page_num": page_meta["page_number"],
-                            "source_elements": getattr(page_result, 'source_elements', []),
-                            "start": getattr(page_result, 'start', -1),
-                            "end": getattr(page_result, 'end', -1),
+                            "source_elements": getattr(page_result, "source_elements", []),
+                            "start": getattr(page_result, "start", -1),
+                            "end": getattr(page_result, "end", -1),
                         }
                         question_results.append(page_result_dict)
-
+
                 except Exception as e:
-                    logger.warning(f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}")
+                    logger.warning(
+                        f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}"
+                    )
                     continue
-
+
             # Sort results by confidence and take the best one for this question
             question_results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
-
+
             if question_results:
                 all_results.append(question_results[0])
             else:
                 # No results found for this question
-                all_results.append(
-                    …
+                all_results.append(
+                    {
+                        "answer": None,
+                        "confidence": 0.0,
+                        "found": False,
+                        "page_num": None,
+                        "source_elements": [],
+                    }
+                )
 
         return all_results
 
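ask_batch() renders each target page, collects word boxes, runs every question against every page, and keeps the highest-confidence hit per question. A usage sketch (questions invented):

    answers = pdf.ask_batch(
        ["Who issued the permit?", "When does it expire?"],
        min_confidence=0.5,
    )
    for ans in answers:
        print(ans["found"], ans["confidence"], ans["answer"])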
@@ -1804,17 +1847,19 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             logger.warning("No pages selected for text update.")
             return self
 
-        logger.info(f"Starting text update for pages: {target_page_indices} with selector='{selector}'")
+        logger.info(
+            f"Starting text update for pages: {target_page_indices} with selector='{selector}'"
+        )
 
         for page_idx in target_page_indices:
             page = self._pages[page_idx]
             try:
-                …
+                page.update_text(
+                    transform=transform,
+                    selector=selector,
+                    max_workers=max_workers,
+                    progress_callback=progress_callback,
+                )
             except Exception as e:
                 logger.error(f"Error during text update on page {page_idx}: {e}")
                 logger.error(f"Error during text update on page {page_idx}: {e}")
@@ -1834,9 +1879,10 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
             raise AttributeError("PDF pages not initialized yet.")
 
         if isinstance(key, slice):
-            from natural_pdf.elements.collections import PageCollection
+            from natural_pdf.core.page_collection import PageCollection
+
             # Use the lazy page list's slicing which returns another _LazyPageList
-            lazy_slice = self._pages[key]
+            lazy_slice = self._pages[key]
             # Wrap in PageCollection for compatibility
             return PageCollection(lazy_slice)
         elif isinstance(key, int):
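The update_text loop passes a transform callback through to each page. A hedged sketch; the element-in, text-out contract is inferred from the per-page call above, not confirmed by this diff:

    # Hypothetical OCR clean-up pass.
    def fix_zeros(el):
        return el.text.replace("0O", "00") if el.text else el.text

    pdf.update_text(transform=fix_zeros, selector="text", max_workers=4)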
@@ -2179,10 +2225,9 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
 
         try:
             for page in tqdm(self.pages, desc="Rendering Pages"):
-                img = page.to_image(
+                # Use render() for clean images
+                img = page.render(
                     resolution=resolution,
-                    include_highlights=include_highlights,
-                    labels=labels,
                     **kwargs,
                 )
                 if img:
@@ -2412,3 +2457,47 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
                 os.unlink(path)
         except Exception as e:
             logger.warning(f"Failed to clean up temporary file '{path}': {e}")
+
+    def analyze_layout(self, *args, **kwargs) -> "ElementCollection[Region]":
+        """
+        Analyzes the layout of all pages in the PDF.
+
+        This is a convenience method that calls analyze_layout on the PDF's
+        page collection.
+
+        Args:
+            *args: Positional arguments passed to pages.analyze_layout().
+            **kwargs: Keyword arguments passed to pages.analyze_layout().
+
+        Returns:
+            An ElementCollection of all detected Region objects.
+        """
+        return self.pages.analyze_layout(*args, **kwargs)
+
+    def highlights(self, show: bool = False) -> "HighlightContext":
+        """
+        Create a highlight context for accumulating highlights.
+
+        This allows for clean syntax to show multiple highlight groups:
+
+        Example:
+            with pdf.highlights() as h:
+                h.add(pdf.find_all('table'), label='tables', color='blue')
+                h.add(pdf.find_all('text:bold'), label='bold text', color='red')
+                h.show()
+
+        Or with automatic display:
+            with pdf.highlights(show=True) as h:
+                h.add(pdf.find_all('table'), label='tables')
+                h.add(pdf.find_all('text:bold'), label='bold')
+                # Automatically shows when exiting the context
+
+        Args:
+            show: If True, automatically show highlights when exiting context
+
+        Returns:
+            HighlightContext for accumulating highlights
+        """
+        from natural_pdf.core.highlighting_service import HighlightContext
+
+        return HighlightContext(self, show_on_exit=show)
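The two new convenience methods compose naturally (selectors and labels here are illustrative):

    regions = pdf.analyze_layout()  # wrapper over pdf.pages.analyze_layout()

    with pdf.highlights(show=True) as h:
        h.add(regions, label="layout regions")
        h.add(pdf.find_all("text:bold"), label="bold text")
    # Highlights are displayed automatically when the context exits.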
natural_pdf/{collections → core}/pdf_collection.py
CHANGED
@@ -588,24 +588,25 @@ class PDFCollection(
         # Get classification manager from first PDF
         try:
             first_pdf = self._pdfs[0]
-            if not hasattr(first_pdf, 'get_manager'):
+            if not hasattr(first_pdf, "get_manager"):
                 raise RuntimeError("PDFs do not support classification manager")
-            manager = first_pdf.get_manager('classification')
+            manager = first_pdf.get_manager("classification")
             if not manager or not manager.is_available():
                 raise RuntimeError("ClassificationManager is not available")
         except Exception as e:
             from natural_pdf.classification.manager import ClassificationError
+
             raise ClassificationError(f"Cannot access ClassificationManager: {e}") from e
 
         # Determine processing mode early
         inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
-
+
         # Gather content from all PDFs
         pdf_contents = []
         valid_pdfs = []
-
+
         logger.info(f"Gathering content from {len(self._pdfs)} PDFs for batch classification...")
-
+
         for pdf in self._pdfs:
             try:
                 # Get the content for classification - use the same logic as individual PDF classify
@@ -618,16 +619,18 @@ class PDFCollection(
                 elif inferred_using == "vision":
                     # For vision, we need single-page PDFs only
                     if len(pdf.pages) != 1:
-                        logger.warning(f"Skipping PDF {pdf.path}: Vision classification requires single-page PDFs")
+                        logger.warning(
+                            f"Skipping PDF {pdf.path}: Vision classification requires single-page PDFs"
+                        )
                         continue
                     # Get first page image
-                    content = pdf.pages[0].to_image()
+                    content = pdf.pages[0].render()
                 else:
                     raise ValueError(f"Unsupported using mode: {inferred_using}")
-
+
                 pdf_contents.append(content)
                 valid_pdfs.append(pdf)
-
+
             except Exception as e:
                 logger.warning(f"Skipping PDF {pdf.path}: Error getting content - {e}")
                 continue
@@ -636,7 +639,9 @@ class PDFCollection(
             logger.warning("No valid content could be gathered from PDFs for classification.")
             return self
 
-        logger.info(f"Gathered content from {len(valid_pdfs)} PDFs. Running batch classification...")
+        logger.info(
+            f"Gathered content from {len(valid_pdfs)} PDFs. Running batch classification..."
+        )
 
         # Run batch classification
         try:
@@ -651,6 +656,7 @@ class PDFCollection(
         except Exception as e:
             logger.error(f"Batch classification failed: {e}")
             from natural_pdf.classification.manager import ClassificationError
+
             raise ClassificationError(f"Batch classification failed: {e}") from e
 
         # Assign results back to PDFs
@@ -660,10 +666,11 @@ class PDFCollection(
                 f"with PDFs processed ({len(valid_pdfs)}). Cannot assign results."
             )
             from natural_pdf.classification.manager import ClassificationError
+
             raise ClassificationError("Batch result count mismatch with input PDFs")
 
         logger.info(f"Assigning {len(batch_results)} results to PDFs under key '{analysis_key}'.")
-
+
         processed_count = 0
         for pdf, result_obj in zip(valid_pdfs, batch_results):
             try:
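These hunks modernize PDFCollection's batch-classification path (black-style reformatting plus the to_image() to render() switch). A heavily hedged usage sketch; the entry-point name classify_all and the constructor call are guesses, since the method signature sits outside the visible hunks:

    import natural_pdf as npdf

    collection = npdf.PDFCollection(["a.pdf", "b.pdf"])

    # Hypothetical entry point for the batch-classification code above.
    # "vision" mode requires single-page PDFs, per the hunk above.
    collection.classify_all(labels=["invoice", "contract", "memo"], using="text")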