PyPI - natural-pdf - Versions diffs - 0.2.3__tar.gz → 0.2.5__tar.gz - Mend

natural-pdf 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (290) hide show

{natural_pdf-0.2.3/natural_pdf.egg-info → natural_pdf-0.2.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.2.3
+Version: 0.2.5
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT

{natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/guides.py RENAMED Viewed

@@ -143,7 +143,7 @@ class GuidesList(UserList):
     def from_content(
         self,
-        markers: Union[str, List[str], "ElementCollection", None],
+        markers: Union[str, List[str], "ElementCollection", Callable, None],
         obj: Optional[Union["Page", "Region", "FlowRegion"]] = None,
         align: Literal["left", "right", "center", "between"] = "left",
         outer: bool = True,
@@ -160,6 +160,7 @@ class GuidesList(UserList):
                 - str: single selector (e.g., 'text:contains("Name")') or literal text
                 - List[str]: list of selectors or literal text strings
                 - ElementCollection: collection of elements to extract text from
+                - Callable: function that takes a page and returns markers
                 - None: no markers
             obj: Page/Region/FlowRegion to search (uses parent's context if None)
             align: How to align guides relative to found elements
@@ -174,13 +175,22 @@ class GuidesList(UserList):
         if target_obj is None:
             raise ValueError("No object provided and no context available")
+        # Store callable markers for later evaluation
+        if callable(markers):
+            self._callable = markers
+            # For now, evaluate with the current target object to get initial guides
+            actual_markers = markers(target_obj)
+        else:
+            self._callable = None
+            actual_markers = markers
         # Check if parent is in flow mode
         if self._parent.is_flow_region:
             # Create guides across all constituent regions
             all_guides = []
             for region in self._parent.context.constituent_regions:
                 # Normalize markers for this region
-                marker_texts = _normalize_markers(markers, region)
+                marker_texts = _normalize_markers(actual_markers, region)
                 # Create guides for this region
                 region_guides = Guides.from_content(
@@ -263,7 +273,7 @@ class GuidesList(UserList):
         # Original single-region logic
         # Normalize markers to list of text strings
-        marker_texts = _normalize_markers(markers, target_obj)
+        marker_texts = _normalize_markers(actual_markers, target_obj)
         # Create guides for this axis
         new_guides = Guides.from_content(
@@ -1541,11 +1551,15 @@ class Guides:
         # Add outer guides if requested
         if outer and bounds:
             if axis == "vertical":
-                guides_coords.insert(0, bounds[0])  # x0
-                guides_coords.append(bounds[2])  # x1
+                if outer == True or outer == "first":
+                    guides_coords.insert(0, bounds[0])  # x0
+                if outer == True or outer == "last":
+                    guides_coords.append(bounds[2])  # x1
             else:
-                guides_coords.insert(0, bounds[1])  # y0
-                guides_coords.append(bounds[3])  # y1
+                if outer == True or outer == "first":
+                    guides_coords.insert(0, bounds[1])  # y0
+                if outer == True or outer == "last":
+                    guides_coords.append(bounds[3])  # y1
         # Remove duplicates and sort
         guides_coords = sorted(list(set(guides_coords)))
@@ -3302,7 +3316,7 @@ class Guides:
         markers: Union[str, List[str], "ElementCollection", None] = None,
         obj: Optional[Union["Page", "Region"]] = None,
         align: Literal["left", "right", "center", "between"] = "left",
-        outer: bool = True,
+        outer: Union[str, bool] = True,
         tolerance: float = 5,
         apply_exclusions: bool = True,
     ) -> "Guides":
@@ -3319,7 +3333,10 @@ class Guides:
                 - None: no markers
             obj: Page or Region to search (uses self.context if None)
             align: How to align guides relative to found elements
-            outer: Whether to add outer boundary guides
+            outer: Whether to add outer boundary guides. Can be:
+                - bool: True/False to add/not add both
+                - "first": To add boundary before the first element
+                - "last": To add boundary after the last element
             tolerance: Tolerance for snapping to element edges
             apply_exclusions: Whether to apply exclusion zones when searching for text
@@ -3457,6 +3474,7 @@ class Guides:
         cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
         show_progress: bool = False,
         content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
+        apply_exclusions: bool = True,
         *,
         multi_page: Literal["auto", True, False] = "auto",
     ) -> "TableResult":
@@ -3482,6 +3500,7 @@ class Guides:
             cell_extraction_func: Optional callable for custom cell text extraction
             show_progress: Controls progress bar for text method
             content_filter: Content filtering function or patterns
+            apply_exclusions: Whether to apply exclusion regions during text extraction (default: True)
             multi_page: Controls multi-region table creation for FlowRegions
         Returns:
@@ -3552,6 +3571,7 @@ class Guides:
                 cell_extraction_func=cell_extraction_func,
                 show_progress=show_progress,
                 content_filter=content_filter,
+                apply_exclusions=apply_exclusions,
             )
             return table_result
@@ -3577,6 +3597,162 @@ class Guides:
             except Exception as cleanup_err:
                 logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
+    def extract_table_from_pages(
+        self,
+        pages: Union["PageCollection", List["Page"]],
+        header: Union[str, List[str], None] = "first",
+        skip_repeating_headers: Optional[bool] = None,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[Dict] = None,
+        cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
+        show_progress: bool = True,
+        content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
+        apply_exclusions: bool = True,
+    ) -> "TableResult":
+        """
+        Extract tables from multiple pages using this guide pattern.
+
+        A fresh ``Guides`` object is built for every page. For each axis, a stored
+        callable marker (``_callable``) is re-evaluated against that page and its
+        result is fed to ``from_content``; otherwise the static guide positions
+        are copied. The per-page tables are then concatenated into a single
+        TableResult.
+
+        Args:
+            pages: PageCollection or list of Pages to extract from
+            header: How to handle headers:
+                - "first": Use first row of first page as headers (default)
+                - "all": Expect headers on each page, use from first page
+                - None: No headers
+                - List[str]: Custom column names, emitted as the first row
+            skip_repeating_headers: Whether to drop the leading row of every page
+                after the first. Defaults to True when header is "first", "all"
+                or a list, False otherwise. It is only consulted when header is
+                "all"; in that mode the first row of each later page is removed
+                whether or not it equals the header found on the first page.
+            method: Table extraction method (passed to extract_table)
+            table_settings: Settings for pdfplumber table extraction
+            use_ocr: Whether to use OCR for text extraction
+            ocr_config: OCR configuration parameters
+            text_options: Dictionary of options for the 'text' method
+            cell_extraction_func: Optional callable for custom cell text extraction
+            show_progress: Show progress bar for multi-page extraction (default: True)
+            content_filter: Content filtering function or patterns
+            apply_exclusions: Whether to apply exclusion regions during extraction
+
+        Returns:
+            TableResult: Combined table data from all pages. When a header was
+            discovered or supplied it is the first row of the result.
+
+        Example:
+            ```python
+            # Create guide with static vertical, dynamic horizontal
+            guide = Guides(pages[0])
+            guide.vertical.from_content(columns, outer="last")
+            guide.horizontal.from_content(lambda p: p.find_all('text:starts-with(NF-)'))
+            # Extract from all pages
+            table_result = guide.extract_table_from_pages(pages, header=columns)
+            df = table_result.to_df()
+            ```
+        """
+        from natural_pdf.tables.result import TableResult
+
+        # Materialise once so that len() and truthiness work for any iterable of pages
+        page_list = list(pages)
+        if not page_list:
+            return TableResult([])
+
+        # Determine header handling
+        if skip_repeating_headers is None:
+            skip_repeating_headers = header in ["first", "all"] or isinstance(header, list)
+
+        all_rows = []
+        header_row = None
+
+        # Configure progress bar (tqdm is optional; fall back to plain iteration)
+        iterator = page_list
+        if show_progress and len(page_list) > 1:
+            try:
+                from tqdm.auto import tqdm
+
+                iterator = tqdm(page_list, desc="Extracting tables from pages", unit="page")
+            except ImportError:
+                pass
+
+        for i, page in enumerate(iterator):
+            # Create a new Guides object for this page
+            page_guide = Guides(page)
+
+            # Vertical first, then horizontal: re-evaluate dynamic markers, copy static ones
+            for axis in ("vertical", "horizontal"):
+                source_axis = getattr(self, axis)
+                target_axis = getattr(page_guide, axis)
+                marker_func = getattr(source_axis, "_callable", None)
+                if marker_func is not None:
+                    target_axis.from_content(marker_func(page))
+                else:
+                    target_axis.data = source_axis.data.copy()
+
+            # Extract table from this page
+            table_result = page_guide.extract_table(
+                method=method,
+                table_settings=table_settings,
+                use_ocr=use_ocr,
+                ocr_config=ocr_config,
+                text_options=text_options,
+                cell_extraction_func=cell_extraction_func,
+                show_progress=False,  # Don't show nested progress
+                content_filter=content_filter,
+                apply_exclusions=apply_exclusions,
+            )
+
+            # Convert to list of rows
+            rows = list(table_result)
+
+            # Handle headers based on strategy
+            if i == 0:  # First page
+                if header in ("first", "all"):
+                    # Use first row as header and remove it from the data
+                    if rows:
+                        header_row = rows[0]
+                        rows = rows[1:]
+                elif isinstance(header, list):
+                    # Custom headers provided (copied so the caller's list is not aliased)
+                    header_row = list(header)
+            elif header == "all" and skip_repeating_headers and rows:
+                # Every later page is expected to start with a header row; drop it
+                rows = rows[1:]
+
+            # Add rows to combined result
+            all_rows.extend(rows)
+
+        # Create final TableResult: discovered and custom headers are both prepended
+        if header_row is not None:
+            return TableResult([header_row] + all_rows)
+        return TableResult(all_rows)
     def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
         """Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
         if not self.is_flow_region or len(self.context.constituent_regions) < 2:

{natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/element_manager.py RENAMED Viewed

@@ -939,6 +939,11 @@ class ElementManager:
         self.load_elements()
         return self._elements.get("chars", [])
+    def invalidate_cache(self):
+        """Drop the cached element data so the next access has to load it again."""
+        # Reset first; the debug line below only records that the reset happened.
+        self._elements = None
+        page_number = self._page.number
+        logger.debug(f"Page {page_number}: ElementManager cache invalidated")
     @property
     def words(self):
         """Get all word elements."""

{natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/page.py RENAMED Viewed

@@ -78,6 +78,7 @@ from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
 # # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
+from natural_pdf.vision.mixin import VisualSearchMixin
 from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
 # --- End Classification Imports --- #
@@ -101,6 +102,7 @@ class Page(
     ExtractionMixin,
     ShapeDetectionMixin,
     DescribeMixin,
+    VisualSearchMixin,
     Visualizable,
 ):
     """Enhanced Page wrapper built on top of pdfplumber.Page.
@@ -492,6 +494,9 @@ class Page(
                                 exc_info=False,
                             )
                             raise
+            # Invalidate ElementManager cache since exclusions affect element filtering
+            if hasattr(self, "_element_mgr") and self._element_mgr:
+                self._element_mgr.invalidate_cache()
             return self  # Completed processing for selector input
         # ElementCollection -----------------------------------------------
@@ -524,6 +529,9 @@ class Page(
                             exc_info=False,
                         )
                         raise
+            # Invalidate ElementManager cache since exclusions affect element filtering
+            if hasattr(self, "_element_mgr") and self._element_mgr:
+                self._element_mgr.invalidate_cache()
             return self  # Completed processing for ElementCollection input
         # ------------------------------------------------------------------
@@ -616,6 +624,9 @@ class Page(
                             f"Page {self.index}: Failed to convert list item to Region: {e}"
                         )
                         continue
+            # Invalidate ElementManager cache since exclusions affect element filtering
+            if hasattr(self, "_element_mgr") and self._element_mgr:
+                self._element_mgr.invalidate_cache()
             return self
         else:
             # Reject invalid types
@@ -627,6 +638,10 @@ class Page(
         if exclusion_data:
             self._exclusions.append(exclusion_data)
+        # Invalidate ElementManager cache since exclusions affect element filtering
+        if hasattr(self, "_element_mgr") and self._element_mgr:
+            self._element_mgr.invalidate_cache()
         return self
     def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
@@ -697,10 +712,26 @@ class Page(
         """
         regions = []
+        # Combine page-specific exclusions with PDF-level exclusions
+        all_exclusions = list(self._exclusions)  # Start with page-specific
+        # Add PDF-level exclusions if we have a parent PDF
+        if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
+            for pdf_exclusion in self._parent._exclusions:
+                # Check if this exclusion is already in our list (avoid duplicates)
+                if pdf_exclusion not in all_exclusions:
+                    # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
+                    if len(pdf_exclusion) == 2:
+                        # Convert to 3-tuple format with default method
+                        pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
+                    all_exclusions.append(pdf_exclusion)
         if debug:
-            print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
+            print(
+                f"\nPage {self.index}: Evaluating {len(all_exclusions)} exclusions ({len(self._exclusions)} page-specific, {len(all_exclusions) - len(self._exclusions)} from PDF)"
+            )
-        for i, exclusion_data in enumerate(self._exclusions):
+        for i, exclusion_data in enumerate(all_exclusions):
             # Handle both old format (2-tuple) and new format (3-tuple) for backward compatibility
             if len(exclusion_data) == 2:
                 # Old format: (exclusion_item, label)
@@ -1596,7 +1627,14 @@ class Page(
             return ""
         # 2. Apply element-based exclusions if enabled
-        if use_exclusions and self._exclusions:
+        # Check both page-level and PDF-level exclusions
+        has_exclusions = bool(self._exclusions) or (
+            hasattr(self, "_parent")
+            and self._parent
+            and hasattr(self._parent, "_exclusions")
+            and self._parent._exclusions
+        )
+        if use_exclusions and has_exclusions:
             # Filter word elements through _filter_elements_by_exclusions
             # This handles both element-based and region-based exclusions
             word_elements = self._filter_elements_by_exclusions(
@@ -1610,7 +1648,7 @@ class Page(
         # 3. Get region-based exclusions for spatial filtering
         apply_exclusions_flag = kwargs.get("use_exclusions", use_exclusions)
         exclusion_regions = []
-        if apply_exclusions_flag and self._exclusions:
+        if apply_exclusions_flag and has_exclusions:
             exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
             if debug:
                 logger.debug(

{natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/pdf.py RENAMED Viewed

@@ -42,6 +42,7 @@ from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
 from natural_pdf.text_mixin import TextMixin
 from natural_pdf.utils.locks import pdf_render_lock
+from natural_pdf.vision.mixin import VisualSearchMixin
 if TYPE_CHECKING:
     from natural_pdf.elements.element_collection import ElementCollection
@@ -172,11 +173,26 @@ class _LazyPageList(Sequence):
         """Create and cache a page at the given index within this list."""
         cached = self._cache[index]
         if cached is None:
+            # Get the actual page index in the full PDF
+            actual_page_index = self._indices[index]
+            # First check if this page is already cached in the parent PDF's main page list
+            if (
+                hasattr(self._parent_pdf, "_pages")
+                and hasattr(self._parent_pdf._pages, "_cache")
+                and actual_page_index < len(self._parent_pdf._pages._cache)
+                and self._parent_pdf._pages._cache[actual_page_index] is not None
+            ):
+                # Reuse the already-cached page from the parent PDF
+                # This ensures we get any exclusions that were already applied
+                cached = self._parent_pdf._pages._cache[actual_page_index]
+                self._cache[index] = cached
+                return cached
             # Import here to avoid circular import problems
             from natural_pdf.core.page import Page
-            # Get the actual page index in the full PDF
-            actual_page_index = self._indices[index]
+            # Create new page
             plumber_page = self._plumber_pdf.pages[actual_page_index]
             cached = Page(
                 plumber_page,
@@ -195,6 +211,30 @@ class _LazyPageList(Sequence):
                     except Exception as e:
                         logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
+            # Check if the parent PDF already has a cached page with page-specific exclusions
+            if hasattr(self._parent_pdf, "_pages") and hasattr(self._parent_pdf._pages, "_cache"):
+                parent_cache = self._parent_pdf._pages._cache
+                if (
+                    actual_page_index < len(parent_cache)
+                    and parent_cache[actual_page_index] is not None
+                ):
+                    existing_page = parent_cache[actual_page_index]
+                    # Copy over any page-specific exclusions from the existing page
+                    # Only copy non-callable exclusions (regions/elements) to avoid duplicating PDF-level exclusions
+                    if hasattr(existing_page, "_exclusions") and existing_page._exclusions:
+                        for exclusion_data in existing_page._exclusions:
+                            exclusion_item = exclusion_data[0]
+                            # Skip callable exclusions as they're PDF-level and already applied above
+                            if not callable(exclusion_item):
+                                try:
+                                    cached.add_exclusion(
+                                        *exclusion_data[:2]
+                                    )  # exclusion_item and label
+                                except Exception as e:
+                                    logger.warning(
+                                        f"Failed to copy page-specific exclusion to page {cached.number}: {e}"
+                                    )
             # Apply any stored regions to the newly created page
             if hasattr(self._parent_pdf, "_regions"):
                 for region_data in self._parent_pdf._regions:
@@ -252,7 +292,9 @@ class _LazyPageList(Sequence):
 # --- End Lazy Page List Helper --- #
-class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, Visualizable):
+class PDF(
+    TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, VisualSearchMixin, Visualizable
+):
     """Enhanced PDF wrapper built on top of pdfplumber.
     This class provides a fluent interface for working with PDF documents,

{natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/pdf_collection.py RENAMED Viewed

@@ -40,6 +40,7 @@ logger = logging.getLogger(__name__)
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.region import Region
 from natural_pdf.export.mixin import ExportMixin
+from natural_pdf.vision.mixin import VisualSearchMixin
 # --- Search Imports ---
 try:
@@ -69,8 +70,8 @@ from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the ne
 class PDFCollection(
-    SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin
-):  # Add ExportMixin and ShapeDetectionMixin
+    SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin, VisualSearchMixin
+):
     def __init__(
         self,
         source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -258,8 +259,6 @@ class PDFCollection(
         return iter(self._pdfs)
     def __repr__(self) -> str:
-        # Removed search status
-        return f"<PDFCollection(count={len(self._pdfs)})>"
         return f"<PDFCollection(count={len(self._pdfs)})>"
     @property
@@ -267,6 +266,134 @@ class PDFCollection(
         """Returns the list of PDF objects held by the collection."""
         return self._pdfs
+    def show(self, limit: Optional[int] = 30, per_pdf_limit: Optional[int] = 10, **kwargs):
+        """
+        Display all PDFs in the collection with labels.
+
+        Each PDF is rendered with its pages in a grid layout (6 columns by default)
+        under a label strip showing its name and page count, and the labelled
+        images are stacked vertically.
+
+        Args:
+            limit: Maximum total pages to show across all PDFs (default: 30).
+                A falsy value disables the overall cap.
+            per_pdf_limit: Maximum pages to show per PDF (default: 10). When falsy
+                and ``limit`` is set, ``limit`` is split evenly across the PDFs.
+            **kwargs: Additional arguments forwarded to each PDF's render-spec
+                builder and to the highlighter's ``unified_render`` call
+                (e.g., columns, exclusions, resolution, etc.). A caller-supplied
+                ``columns`` or ``layout`` overrides the defaults used here.
+
+        Returns:
+            The combined PIL image (Jupyter displays it automatically), or None
+            when the collection is empty or nothing could be rendered.
+        """
+        if not self._pdfs:
+            print("Empty collection")
+            return None
+
+        # Import here to avoid circular imports
+        from PIL import Image, ImageDraw, ImageFont
+
+        # Calculate pages per PDF if total limit is set
+        if limit and not per_pdf_limit:
+            per_pdf_limit = max(1, limit // len(self._pdfs))
+
+        # Collect images from each PDF
+        all_images = []
+        total_pages_shown = 0
+
+        for pdf in self._pdfs:
+            if limit and total_pages_shown >= limit:
+                break
+
+            # Calculate limit for this PDF
+            pdf_limit = per_pdf_limit
+            if limit:
+                remaining = limit - total_pages_shown
+                pdf_limit = min(per_pdf_limit or remaining, remaining)
+
+            # Get PDF identifier
+            pdf_name = getattr(pdf, "filename", None) or getattr(pdf, "path", "Unknown")
+            if isinstance(pdf_name, Path):
+                pdf_name = pdf_name.name
+            elif "/" in str(pdf_name):
+                pdf_name = str(pdf_name).split("/")[-1]
+
+            # Render this PDF
+            try:
+                # Get render specs from the PDF
+                render_specs = pdf._get_render_specs(mode="show", max_pages=pdf_limit, **kwargs)
+                if not render_specs:
+                    continue
+
+                # Defaults go in via setdefault so that a caller-supplied `columns`
+                # or `layout` no longer collides with an explicit keyword argument.
+                render_kwargs = dict(kwargs)
+                render_kwargs.setdefault("layout", "grid" if len(render_specs) > 1 else "single")
+                render_kwargs.setdefault("columns", 6)
+
+                # Get the highlighter and render without displaying
+                highlighter = pdf._get_highlighter()
+                pdf_image = highlighter.unified_render(specs=render_specs, **render_kwargs)
+
+                if pdf_image:
+                    # Add label above the PDF image
+                    label_height = 40
+                    label_bg_color = (240, 240, 240)
+                    label_text_color = (0, 0, 0)
+
+                    # Create new image with space for label
+                    width, height = pdf_image.size
+                    labeled_image = Image.new("RGB", (width, height + label_height), "white")
+
+                    # Draw label background
+                    draw = ImageDraw.Draw(labeled_image)
+                    draw.rectangle([0, 0, width, label_height], fill=label_bg_color)
+
+                    # Draw label text
+                    try:
+                        # Try to use a nice font if available
+                        font = ImageFont.truetype("Arial", 20)
+                    except Exception:
+                        # Best effort: any font-loading failure falls back to the default font
+                        font = ImageFont.load_default()
+
+                    page_count = len(pdf.pages)
+                    label_text = f"{pdf_name} ({page_count} pages)"
+                    draw.text((10, 10), label_text, fill=label_text_color, font=font)
+
+                    # Paste PDF image below label
+                    labeled_image.paste(pdf_image, (0, label_height))
+                    all_images.append(labeled_image)
+
+                    # pdf_limit is None when neither limit is set; min(None, n) would raise
+                    total_pages_shown += (
+                        page_count if pdf_limit is None else min(pdf_limit, page_count)
+                    )
+            except Exception as e:
+                logger.warning(f"Failed to render PDF {pdf_name}: {e}")
+                continue
+
+        if not all_images:
+            print("No PDFs could be rendered")
+            return None
+
+        # Combine all images vertically
+        if len(all_images) == 1:
+            combined = all_images[0]
+        else:
+            # Add spacing between PDFs
+            spacing = 20
+            total_height = sum(img.height for img in all_images) + spacing * (len(all_images) - 1)
+            max_width = max(img.width for img in all_images)
+            combined = Image.new("RGB", (max_width, total_height), "white")
+
+            y_offset = 0
+            for i, img in enumerate(all_images):
+                # Center images if they're narrower than max width
+                x_offset = (max_width - img.width) // 2
+                combined.paste(img, (x_offset, y_offset))
+                y_offset += img.height
+                if i < len(all_images) - 1:
+                    y_offset += spacing
+
+        # Return the combined image (Jupyter will display it automatically)
+        return combined
     @overload
     def find_all(
         self,

{natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/render_spec.py RENAMED Viewed

@@ -186,7 +186,7 @@ class Visualizable:
         color: Optional[Union[str, Tuple[int, int, int]]] = None,
         labels: bool = True,
         label_format: Optional[str] = None,
-        highlights: Optional[List[Dict[str, Any]]] = None,
+        highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
         legend_position: str = "right",
         annotate: Optional[Union[str, List[str]]] = None,
         # Layout options for multi-page/region
@@ -211,7 +211,7 @@ class Visualizable:
             color: Default highlight color
             labels: Whether to show labels for highlights
             label_format: Format string for labels (e.g., "Element {index}")
-            highlights: Additional highlight groups to show
+            highlights: Additional highlight groups to show, or False to disable all highlights
             legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
             annotate: Attribute name(s) to display on highlights (string or list)
             layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)

natural-pdf 0.2.3__tar.gz → 0.2.5__tar.gz

natural-pdf 0.2.3__tar.gz → 0.2.5__tar.gz