PyPI - natural-pdf - Versions diffs - 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl - Mend

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (41)

natural_pdf/__init__.py +7 -2
natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
natural_pdf/analyzers/text_options.py +9 -1
natural_pdf/analyzers/text_structure.py +371 -58
natural_pdf/classification/manager.py +3 -4
natural_pdf/collections/pdf_collection.py +19 -39
natural_pdf/core/element_manager.py +11 -1
natural_pdf/core/highlighting_service.py +146 -75
natural_pdf/core/page.py +287 -188
natural_pdf/core/pdf.py +57 -42
natural_pdf/elements/base.py +51 -0
natural_pdf/elements/collections.py +362 -67
natural_pdf/elements/line.py +5 -0
natural_pdf/elements/region.py +396 -23
natural_pdf/exporters/data/__init__.py +0 -0
natural_pdf/exporters/data/pdf.ttf +0 -0
natural_pdf/exporters/data/sRGB.icc +0 -0
natural_pdf/exporters/hocr.py +40 -61
natural_pdf/exporters/hocr_font.py +7 -13
natural_pdf/exporters/original_pdf.py +10 -13
natural_pdf/exporters/paddleocr.py +51 -11
natural_pdf/exporters/searchable_pdf.py +0 -10
natural_pdf/flows/__init__.py +12 -0
natural_pdf/flows/collections.py +533 -0
natural_pdf/flows/element.py +382 -0
natural_pdf/flows/flow.py +216 -0
natural_pdf/flows/region.py +458 -0
natural_pdf/search/__init__.py +65 -52
natural_pdf/search/lancedb_search_service.py +325 -0
natural_pdf/search/numpy_search_service.py +255 -0
natural_pdf/search/searchable_mixin.py +25 -71
natural_pdf/selectors/parser.py +163 -8
natural_pdf/widgets/viewer.py +22 -31
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
natural_pdf/search/haystack_search_service.py +0 -687
natural_pdf/search/haystack_utils.py +0 -474
natural_pdf/utils/tqdm_utils.py +0 -51
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0

natural_pdf/elements/collections.py (changed)

@@ -18,12 +18,13 @@ from typing import (
     Union,
     overload,
 )
+import hashlib
 from pdfplumber.utils.geometry import objects_to_bbox
-from PIL import Image, ImageDraw, ImageFont
 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+from PIL import Image, ImageDraw, ImageFont
 from tqdm.auto import tqdm
 from natural_pdf.classification.manager import ClassificationManager
@@ -37,6 +38,8 @@ from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
+from tqdm.auto import tqdm
 # Potentially lazy imports for optional dependencies needed in save_pdf
 try:
@@ -46,7 +49,6 @@ except ImportError:
 try:
     from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
-    pass
 except ImportError:
     create_searchable_pdf = None
@@ -61,8 +63,9 @@ logger = logging.getLogger(__name__)
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
-    from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
+    from natural_pdf.core.pdf import PDF  # ---> ADDED PDF type hint
     from natural_pdf.elements.region import Region
+    from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
 T = TypeVar("T")
 P = TypeVar("P", bound="Page")
@@ -840,6 +843,7 @@ class ElementCollection(
         labels: bool = True,  # Use 'labels' consistent with service
         legend_position: str = "right",
         render_ocr: bool = False,
+        width: Optional[int] = None,  # Add width parameter
     ) -> Optional["Image.Image"]:
         """
         Generates a temporary preview image highlighting elements in this collection
@@ -862,6 +866,7 @@ class ElementCollection(
             labels: Whether to include a legend for the temporary highlights.
             legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
             render_ocr: Whether to render OCR text.
+            width: Optional width for the output image in pixels.
         Returns:
             PIL Image object of the temporary preview, or None if rendering fails or
@@ -922,6 +927,7 @@ class ElementCollection(
                 page_index=page.index,
                 temporary_highlights=highlight_data_list,
                 scale=scale,
+                width=width,  # Pass the width parameter
                 labels=labels,  # Use 'labels'
                 legend_position=legend_position,
                 render_ocr=render_ocr,
@@ -1159,10 +1165,96 @@ class ElementCollection(
         Args:
             selector: CSS-like selector string
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                      'any' (any overlap), or 'center' (center point inside).
+                      (default: "all")
             apply_exclusions: Whether to exclude elements in exclusion regions
         """
         return self.apply(lambda element: element.find(selector, **kwargs))
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        contains: str = "all",
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        contains: str = "all",
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        contains: str = "all",
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
+        """
+        Find all elements within each element of this collection matching the selector OR text,
+        and return a flattened collection of all found sub-elements.
+        Provide EITHER `selector` OR `text`, but not both.
+        Args:
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                     'any' (any overlap), or 'center' (center point inside).
+                     (default: "all")
+            apply_exclusions: Whether to apply exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional parameters for element filtering.
+        Returns:
+            A new ElementCollection containing all matching sub-elements from all elements
+            in this collection.
+        """
+        if selector is None and text is None:
+            raise ValueError("Either 'selector' or 'text' must be provided to find_all.")
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text' to find_all, not both.")
+        all_found_elements: List[Element] = []
+        for element in self._elements:
+            if hasattr(element, "find_all") and callable(element.find_all):
+                # Element.find_all returns an ElementCollection
+                found_in_element: "ElementCollection" = element.find_all(
+                    selector=selector,
+                    text=text,
+                    contains=contains,
+                    apply_exclusions=apply_exclusions,
+                    regex=regex,
+                    case=case,
+                    **kwargs,
+                )
+                if found_in_element and found_in_element.elements:
+                    all_found_elements.extend(found_in_element.elements)
+            # else:
+            # Elements in the collection are expected to support find_all.
+            # If an element type doesn't, an AttributeError will naturally occur,
+            # or a more specific check/handling could be added here if needed.
+        return ElementCollection(all_found_elements)
     def extract_each_text(self, **kwargs) -> List[str]:
         """
         Extract text from each element in this region.
@@ -1496,13 +1588,162 @@ class ElementCollection(
         return all_data
+    def to_text_elements(
+        self,
+        text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
+        source_label: str = "derived_from_region",
+        object_type: str = "word",
+        default_font_size: float = 10.0,
+        default_font_name: str = "RegionContent",
+        confidence: Optional[float] = None,
+        add_to_page: bool = False # Default is False
+    ) -> "ElementCollection[TextElement]":
+        """
+        Converts each Region in this collection to a TextElement.
+        Args:
+            text_content_func: A callable that takes a Region and returns its text
+                               (or None). If None, all created TextElements will
+                               have text=None.
+            source_label: The 'source' attribute for the new TextElements.
+            object_type: The 'object_type' for the TextElement's data dict.
+            default_font_size: Placeholder font size.
+            default_font_name: Placeholder font name.
+            confidence: Confidence score.
+            add_to_page: If True (default is False), also adds the created
+                         TextElements to their respective page's element manager.
+        Returns:
+            A new ElementCollection containing the created TextElement objects.
+        """
+        from natural_pdf.elements.region import Region # Local import for type checking if needed or to resolve circularity
+        from natural_pdf.elements.text import TextElement # Ensure TextElement is imported for type hint if not in TYPE_CHECKING
+        new_text_elements: List["TextElement"] = []
+        if not self.elements: # Accesses self._elements via property
+            return ElementCollection([])
+        page_context_for_adding: Optional["Page"] = None
+        if add_to_page:
+            # Try to determine a consistent page context if adding elements
+            first_valid_region_with_page = next(
+                (el for el in self.elements if isinstance(el, Region) and hasattr(el, 'page') and el.page is not None),
+                None
+            )
+            if first_valid_region_with_page:
+                page_context_for_adding = first_valid_region_with_page.page
+            else:
+                logger.warning("Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None.")
+                add_to_page = False # Disable adding if no valid page context can be determined
+        for element in self.elements: # Accesses self._elements via property/iterator
+            if isinstance(element, Region):
+                text_el = element.to_text_element(
+                    text_content=text_content_func,
+                    source_label=source_label,
+                    object_type=object_type,
+                    default_font_size=default_font_size,
+                    default_font_name=default_font_name,
+                    confidence=confidence
+                )
+                new_text_elements.append(text_el)
+                if add_to_page:
+                    if not hasattr(text_el, 'page') or text_el.page is None:
+                        logger.warning(f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page.")
+                        continue
+                    if page_context_for_adding and text_el.page == page_context_for_adding:
+                        if hasattr(page_context_for_adding, '_element_mgr') and page_context_for_adding._element_mgr is not None:
+                            add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
+                            page_context_for_adding._element_mgr.add_element(text_el, element_type=add_as_type)
+                        else:
+                            page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
+                            logger.error(f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement.")
+                    elif page_context_for_adding and text_el.page != page_context_for_adding:
+                        current_page_num_str = str(text_el.page.page_number) if hasattr(text_el.page, 'page_number') else "Unknown"
+                        context_page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else "N/A"
+                        logger.warning(f"TextElement for region {element.bbox} from page {current_page_num_str} "
+                                       f"not added as it's different from collection's inferred page context {context_page_num_str}.")
+                    elif not page_context_for_adding:
+                        logger.warning(f"TextElement for region {element.bbox} created, but no page context was determined for adding.")
+            else:
+                logger.warning(f"Skipping element {type(element)}, not a Region.")
+        if add_to_page and page_context_for_adding:
+            page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
+            logger.info(f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}.")
+        elif add_to_page and not page_context_for_adding:
+             logger.info(f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent.")
+        else: # add_to_page is False
+            logger.info(f"Created {len(new_text_elements)} TextElements (not added to page).")
+        return ElementCollection(new_text_elements)
+    def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, show_progress: bool = True) -> "ElementCollection":
+        """
+        Trim visual whitespace from each region in the collection.
+        Applies the trim() method to each element in the collection,
+        returning a new collection with the trimmed regions.
+        Args:
+            padding: Number of pixels to keep as padding after trimming (default: 1)
+            threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
+            resolution: Resolution for image rendering in DPI (default: 150)
+            show_progress: Whether to show a progress bar for the trimming operation
+        Returns:
+            New ElementCollection with trimmed regions
+        """
+        return self.apply(
+            lambda element: element.trim(padding=padding, threshold=threshold, resolution=resolution),
+            show_progress=show_progress
+        )
+    def clip(
+        self,
+        obj: Optional[Any] = None,
+        left: Optional[float] = None,
+        top: Optional[float] = None,
+        right: Optional[float] = None,
+        bottom: Optional[float] = None,
+    ) -> "ElementCollection":
+        """
+        Clip each element in the collection to the specified bounds.
+        This method applies the clip operation to each individual element,
+        returning a new collection with the clipped elements.
+        Args:
+            obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
+            left: Optional left boundary (x0) to clip to
+            top: Optional top boundary to clip to
+            right: Optional right boundary (x1) to clip to
+            bottom: Optional bottom boundary to clip to
+        Returns:
+            New ElementCollection containing the clipped elements
+        Examples:
+            # Clip each element to another region's bounds
+            clipped_elements = collection.clip(container_region)
+            # Clip each element to specific coordinates
+            clipped_elements = collection.clip(left=100, right=400)
+            # Mix object bounds with specific overrides
+            clipped_elements = collection.clip(obj=container, bottom=page.height/2)
+        """
+        return self.apply(
+            lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
+        )
-class PageCollection(Generic[P], ApplyMixin):
-    """
-    A collection of PDF pages with cross-page operations.
-    This class provides methods for working with multiple pages, such as finding
-    elements across pages, extracting text from page ranges, and more.
+class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
+    """
+    Represents a collection of Page objects, often from a single PDF document.
+    Provides methods for batch operations on these pages.
     """
     def __init__(self, pages: List[P]):
@@ -1633,6 +1874,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1644,6 +1886,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1655,6 +1898,7 @@ class PageCollection(Generic[P], ApplyMixin):
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1668,6 +1912,9 @@ class PageCollection(Generic[P], ApplyMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                     'any' (any overlap), or 'center' (center point inside).
+                     (default: "all")
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1681,6 +1928,7 @@ class PageCollection(Generic[P], ApplyMixin):
             element = page.find(
                 selector=selector,
                 text=text,
+                contains=contains,
                 apply_exclusions=apply_exclusions,
                 regex=regex,
                 case=case,
@@ -1695,6 +1943,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1706,6 +1955,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1717,6 +1967,7 @@ class PageCollection(Generic[P], ApplyMixin):
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1730,6 +1981,9 @@ class PageCollection(Generic[P], ApplyMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                     'any' (any overlap), or 'center' (center point inside).
+                     (default: "all")
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1744,6 +1998,7 @@ class PageCollection(Generic[P], ApplyMixin):
             elements = page.find_all(
                 selector=selector,
                 text=text,
+                contains=contains,
                 apply_exclusions=apply_exclusions,
                 regex=regex,
                 case=case,
@@ -1817,7 +2072,7 @@ class PageCollection(Generic[P], ApplyMixin):
         end_elements=None,
         new_section_on_page_break=False,
         boundary_inclusion="both",
-    ) -> List["Region"]:
+    ) -> "ElementCollection[Region]":
         """
         Extract sections from a page collection based on start/end elements.
@@ -2110,7 +2365,7 @@ class PageCollection(Generic[P], ApplyMixin):
                 region.start_element = start_element
                 sections.append(region)
-        return sections
+        return ElementCollection(sections)
     def _gather_analysis_data(
         self,
@@ -2314,8 +2569,10 @@ class PageCollection(Generic[P], ApplyMixin):
         try:
             from PIL import Image, ImageDraw, ImageFont
         except ImportError:
-             logger.error("Pillow library not found, required for to_image(). Install with 'pip install Pillow'")
-             return None
+            logger.error(
+                "Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
+            )
+            return None
         if not self.pages:
             logger.warning("Cannot generate image for empty PageCollection")
@@ -2334,27 +2591,34 @@ class PageCollection(Generic[P], ApplyMixin):
                 try:
                     font = ImageFont.load_default(16)
                 except IOError:
-                     logger.warning("Default font not found. Labels cannot be added.")
-                     add_labels = False # Disable if no font
+                    logger.warning("Default font not found. Labels cannot be added.")
+                    add_labels = False  # Disable if no font
         # Render individual page images
         page_images = []
         for page in pages_to_render:
             try:
                 # Assume page.to_image returns a PIL Image or None
-                img = page.to_image(width=page_width, include_highlights=True) # Render with highlights for visual context
+                img = page.to_image(
+                    width=page_width, include_highlights=True
+                )  # Render with highlights for visual context
                 if img is None:
-                     logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
-                     continue
+                    logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
+                    continue
             except Exception as img_err:
-                 logger.error(f"Error generating image for page {page.number}: {img_err}", exc_info=True)
-                 continue
+                logger.error(
+                    f"Error generating image for page {page.number}: {img_err}", exc_info=True
+                )
+                continue
             # Add page number label
             if add_labels and font:
                 draw = ImageDraw.Draw(img)
-                pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path") else ""
+                pdf_name = (
+                    Path(page.pdf.path).stem
+                    if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
+                    else ""
+                )
                 label_text = f"p{page.number}"
                 if pdf_name:
                     label_text += f" - {pdf_name}"
@@ -2364,43 +2628,65 @@ class PageCollection(Generic[P], ApplyMixin):
                     # Placeholder logic - adjust based on how classification results are stored
                     category = None
                     confidence = None
-                    if hasattr(page, 'analyses') and page.analyses and 'classification' in page.analyses:
-                        result = page.analyses['classification']
+                    if (
+                        hasattr(page, "analyses")
+                        and page.analyses
+                        and "classification" in page.analyses
+                    ):
+                        result = page.analyses["classification"]
                         # Adapt based on actual structure of classification result
-                        category = getattr(result, 'label', None) or result.get('label', None) if isinstance(result, dict) else None
-                        confidence = getattr(result, 'score', None) or result.get('score', None) if isinstance(result, dict) else None
+                        category = (
+                            getattr(result, "label", None) or result.get("label", None)
+                            if isinstance(result, dict)
+                            else None
+                        )
+                        confidence = (
+                            getattr(result, "score", None) or result.get("score", None)
+                            if isinstance(result, dict)
+                            else None
+                        )
                     if category is not None and confidence is not None:
-                         try:
-                            category_str = f"{category} ({confidence:.2f})" # Format confidence
+                        try:
+                            category_str = f"{category} ({confidence:.2f})"  # Format confidence
                             label_text += f"\\n{category_str}"
-                         except (TypeError, ValueError): pass # Ignore formatting errors
+                        except (TypeError, ValueError):
+                            pass  # Ignore formatting errors
                 # Calculate bounding box for multi-line text and draw background/text
                 try:
                     # Using textbbox for potentially better accuracy with specific fonts
                     # Note: textbbox needs Pillow 8+
-                    bbox = draw.textbbox((5, 5), label_text, font=font, spacing=2) # Use textbbox if available
-                    bg_rect = (max(0, bbox[0] - 2), max(0, bbox[1] - 2),
-                               min(img.width, bbox[2] + 2), min(img.height, bbox[3] + 2))
+                    bbox = draw.textbbox(
+                        (5, 5), label_text, font=font, spacing=2
+                    )  # Use textbbox if available
+                    bg_rect = (
+                        max(0, bbox[0] - 2),
+                        max(0, bbox[1] - 2),
+                        min(img.width, bbox[2] + 2),
+                        min(img.height, bbox[3] + 2),
+                    )
                     # Draw semi-transparent background
-                    overlay = Image.new('RGBA', img.size, (255, 255, 255, 0))
+                    overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
                     draw_overlay = ImageDraw.Draw(overlay)
-                    draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180)) # White with alpha
-                    img = Image.alpha_composite(img.convert('RGBA'), overlay).convert('RGB')
-                    draw = ImageDraw.Draw(img) # Recreate draw object
+                    draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180))  # White with alpha
+                    img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
+                    draw = ImageDraw.Draw(img)  # Recreate draw object
                     # Draw the potentially multi-line text
                     draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
-                except AttributeError: # Fallback for older Pillow without textbbox
+                except AttributeError:  # Fallback for older Pillow without textbbox
                     # Approximate size and draw
                     # This might not be perfectly aligned
-                     draw.rectangle((2, 2, 150, 40), fill=(255, 255, 255, 180)) # Simple fixed background
-                     draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
+                    draw.rectangle(
+                        (2, 2, 150, 40), fill=(255, 255, 255, 180)
+                    )  # Simple fixed background
+                    draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
                 except Exception as draw_err:
-                     logger.error(f"Error drawing label on page {page.number}: {draw_err}", exc_info=True)
+                    logger.error(
+                        f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
+                    )
             page_images.append(img)
@@ -2408,7 +2694,6 @@ class PageCollection(Generic[P], ApplyMixin):
             logger.warning("No page images were successfully rendered for the grid.")
             return None
         # Calculate grid dimensions if not provided
         num_images = len(page_images)
         if not rows and not cols:
@@ -2418,24 +2703,23 @@ class PageCollection(Generic[P], ApplyMixin):
             cols = (num_images + rows - 1) // rows
         elif cols and not rows:
             rows = (num_images + cols - 1) // cols
-        cols = max(1, cols if cols else 1) # Ensure at least 1
+        cols = max(1, cols if cols else 1)  # Ensure at least 1
         rows = max(1, rows if rows else 1)
         # Get maximum dimensions for consistent grid cells
         max_width = max(img.width for img in page_images) if page_images else 1
         max_height = max(img.height for img in page_images) if page_images else 1
         # Create grid image
         grid_width = cols * max_width + (cols + 1) * spacing
         grid_height = rows * max_height + (rows + 1) * spacing
-        grid_img = Image.new("RGB", (grid_width, grid_height), (220, 220, 220)) # Lighter gray background
+        grid_img = Image.new(
+            "RGB", (grid_width, grid_height), (220, 220, 220)
+        )  # Lighter gray background
         # Place images in grid
         for i, img in enumerate(page_images):
-            if i >= rows * cols: # Ensure we don't exceed grid capacity
+            if i >= rows * cols:  # Ensure we don't exceed grid capacity
                 break
             row = i // cols
@@ -2484,8 +2768,8 @@ class PageCollection(Generic[P], ApplyMixin):
         if not self.pages:
             raise ValueError("Cannot save an empty PageCollection.")
-        if not (ocr ^ original): # XOR: exactly one must be true
-             raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
+        if not (ocr ^ original):  # XOR: exactly one must be true
+            raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
         output_path_obj = Path(output_path)
         output_path_str = str(output_path_obj)
@@ -2494,18 +2778,29 @@ class PageCollection(Generic[P], ApplyMixin):
             if create_searchable_pdf is None:
                 raise ImportError(
                     "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
-                    "Install with: pip install \\\"natural-pdf[ocr-export]\\\"" # Escaped quotes
+                    'Install with: pip install \\"natural-pdf[ocr-export]\\"'  # Escaped quotes
                 )
             # Check for non-OCR vector elements (provide a warning)
             has_vector_elements = False
             for page in self.pages:
                 # Simplified check for common vector types or non-OCR chars/words
-                if (hasattr(page, 'rects') and page.rects or
-                    hasattr(page, 'lines') and page.lines or
-                    hasattr(page, 'curves') and page.curves or
-                    (hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
-                    (hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
+                if (
+                    hasattr(page, "rects")
+                    and page.rects
+                    or hasattr(page, "lines")
+                    and page.lines
+                    or hasattr(page, "curves")
+                    and page.curves
+                    or (
+                        hasattr(page, "chars")
+                        and any(getattr(el, "source", None) != "ocr" for el in page.chars)
+                    )
+                    or (
+                        hasattr(page, "words")
+                        and any(getattr(el, "source", None) != "ocr" for el in page.words)
+                    )
+                ):
                     has_vector_elements = True
                     break
             if has_vector_elements:
@@ -2532,22 +2827,22 @@ class PageCollection(Generic[P], ApplyMixin):
             if create_original_pdf is None:
                 raise ImportError(
                     "Saving with original=True requires 'pikepdf'. "
-                    "Install with: pip install \\\"natural-pdf[ocr-export]\\\"" # Escaped quotes
+                    'Install with: pip install \\"natural-pdf[ocr-export]\\"'  # Escaped quotes
                 )
             # Check for OCR elements (provide a warning) - keep this check here
             has_ocr_elements = False
             for page in self.pages:
-                 # Use find_all which returns a collection; check if it's non-empty
-                 if hasattr(page, 'find_all'):
-                     ocr_text_elements = page.find_all("text[source=ocr]")
-                     if ocr_text_elements: # Check truthiness of collection
-                         has_ocr_elements = True
-                         break
-                 elif hasattr(page, 'words'): # Fallback check if find_all isn't present?
-                     if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
-                          has_ocr_elements = True
-                          break
+                # Use find_all which returns a collection; check if it's non-empty
+                if hasattr(page, "find_all"):
+                    ocr_text_elements = page.find_all("text[source=ocr]")
+                    if ocr_text_elements:  # Check truthiness of collection
+                        has_ocr_elements = True
+                        break
+                elif hasattr(page, "words"):  # Fallback check if find_all isn't present?
+                    if any(getattr(el, "source", None) == "ocr" for el in page.words):
+                        has_ocr_elements = True
+                        break
             if has_ocr_elements:
                 logger.warning(
@@ -2565,5 +2860,5 @@ class PageCollection(Generic[P], ApplyMixin):
             except Exception as e:
                 # Error logging is handled within create_original_pdf
                 # Re-raise the exception caught from the exporter
-                raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
+                raise e  # Keep the original exception type (ValueError, RuntimeError, etc.)
             # <--- END MODIFIED

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl