PyPI - natural-pdf - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl - Mend

natural-pdf: 0.1.0 (py3-none-any.whl) → 0.1.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

natural_pdf/__init__.py +1 -1
natural_pdf/core/highlighting_service.py +48 -17
natural_pdf/core/page.py +92 -27
natural_pdf/core/pdf.py +11 -0
natural_pdf/elements/base.py +99 -14
natural_pdf/elements/collections.py +56 -0
natural_pdf/elements/region.py +4 -106
natural_pdf/qa/document_qa.py +4 -3
natural_pdf/selectors/parser.py +215 -1
natural_pdf/utils/visualization.py +2 -2
{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/METADATA +10 -10
{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/RECORD +15 -15
{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/top_level.txt +0 -0

natural_pdf/__init__.py CHANGED Viewed

@@ -47,7 +47,7 @@ try:
 except ImportError:
     HAS_QA = False
-__version__ = "0.1.0"
+__version__ = "0.1.1"
 if HAS_QA:
     __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging", "DocumentQA", "get_qa_engine"]

natural_pdf/core/highlighting_service.py CHANGED Viewed

@@ -383,7 +383,7 @@ class HighlightingService:
     def add(
         self,
         page_index: int,
-        bbox: Tuple[float, float, float, float],
+        bbox: Union[Tuple[float, float, float, float], Any], # Relax input type hint
         color: Optional[Union[Tuple, str]] = None,
         label: Optional[str] = None,
         use_color_cycling: bool = False,
@@ -392,9 +392,32 @@ class HighlightingService:
         existing: str = 'append'
     ):
         """Adds a rectangular highlight."""
+        processed_bbox: Tuple[float, float, float, float]
+        # Check if bbox is an object with expected attributes (likely a Region)
+        # Assuming Region object has x0, top, x1, bottom attributes based on error context
+        if (hasattr(bbox, 'x0') and hasattr(bbox, 'top') and
+            hasattr(bbox, 'x1') and hasattr(bbox, 'bottom')):
+             try:
+                # Ensure attributes are numeric before creating tuple
+                processed_bbox = (float(bbox.x0), float(bbox.top), float(bbox.x1), float(bbox.bottom))
+             except (ValueError, TypeError):
+                 logger.error(f"Invalid attribute types in bbox object for page {page_index}: {bbox}. Expected numeric values.")
+                 return
+        elif isinstance(bbox, (list, tuple)) and len(bbox) == 4:
+             try:
+                 # Ensure elements are numeric and convert to tuple
+                 processed_bbox = tuple(float(v) for v in bbox)
+             except (ValueError, TypeError):
+                 logger.error(f"Invalid values in bbox sequence for page {page_index}: {bbox}. Expected numeric values.")
+                 return
+        else:
+            logger.error(f"Invalid bbox type or structure provided for page {page_index}: {type(bbox)} - {bbox}. Expected tuple/list of 4 numbers or Region-like object.")
+            return # Don't proceed if bbox is invalid
         self._add_internal(
             page_index=page_index,
-            bbox=bbox,
+            bbox=processed_bbox, # Use the processed tuple
             polygon=None,
             color_input=color,
             label=label,
@@ -526,6 +549,7 @@ class HighlightingService:
     ) -> Optional[Image.Image]:
         """
         Renders a specific page with its highlights.
+        Legend is now generated based only on highlights present on this page.
         Args:
             page_index: The 0-based index of the page to render.
@@ -545,23 +569,19 @@ class HighlightingService:
             return None
         page = self._pdf[page_index]
-        highlights_on_page = self.get_highlights_for_page(page_index)
+        highlights_on_page = self.get_highlights_for_page(page_index) # This list will be empty if clear_page was called
         # --- Get Base Image ---
         try:
             render_resolution = resolution if resolution is not None else scale * 72
-            # Use the underlying pdfplumber page object for base rendering
             img_object = page._page.to_image(resolution=render_resolution, **kwargs)
-            # Access the PIL image directly
-            base_image = img_object.annotated # .annotated usually holds the PIL Image
+            base_image = img_object.annotated
             if not isinstance(base_image, Image.Image):
-                 # Fallback for different pdfplumber versions/outputs
                  png_data = img_object._repr_png_()
                  if png_data:
                       base_image = Image.open(io.BytesIO(png_data)).convert('RGB')
                  else:
                       raise ValueError("Could not extract base PIL image from pdfplumber.")
-            # Convert to RGBA for compositing
             base_image = base_image.convert('RGBA')
             logger.debug(f"Base image for page {page_index} rendered with resolution {render_resolution}.")
         except Exception as e:
@@ -569,6 +589,7 @@ class HighlightingService:
             return None
         # --- Render Highlights ---
+        rendered_image: Image.Image
         if highlights_on_page:
             renderer = HighlightRenderer(
                 page=page,
@@ -579,21 +600,31 @@ class HighlightingService:
             )
             rendered_image = renderer.render()
         else:
-             # If no highlights, still need to potentially render OCR if requested
              if render_ocr:
+                  # Still render OCR even if no highlights
                   renderer = HighlightRenderer(page, base_image, [], scale, True)
-                  rendered_image = renderer.render() # Will only call _render_ocr_text
+                  rendered_image = renderer.render()
              else:
                   rendered_image = base_image # No highlights, no OCR requested
-        # --- Add Legend ---
+        # --- Add Legend (Based ONLY on this page's highlights) ---
         if labels:
-            label_colors = self.get_labels_and_colors()
-            if label_colors:
-                legend = create_legend(label_colors)
-                rendered_image = merge_images_with_legend(rendered_image, legend, legend_position)
-                logger.debug(f"Added legend with {len(label_colors)} labels to page {page_index}.")
+            # CHANGE: Create label_colors map only from highlights_on_page
+            labels_colors_on_page: Dict[str, Tuple[int, int, int, int]] = {}
+            for hl in highlights_on_page:
+                if hl.label and hl.label not in labels_colors_on_page:
+                    labels_colors_on_page[hl.label] = hl.color
+            if labels_colors_on_page: # Only add legend if there are labels on this page
+                legend = create_legend(labels_colors_on_page)
+                if legend: # Ensure create_legend didn't return None
+                     rendered_image = merge_images_with_legend(rendered_image, legend, legend_position)
+                     logger.debug(f"Added legend with {len(labels_colors_on_page)} labels for page {page_index}.")
+                else:
+                     logger.debug(f"Legend creation returned None for page {page_index}.")
+            else:
+                 logger.debug(f"No labels found on page {page_index}, skipping legend.")
         return rendered_image
     def render_preview(

natural_pdf/core/page.py CHANGED Viewed

@@ -9,6 +9,7 @@ import io
 import json
 from natural_pdf.elements.collections import ElementCollection
+from natural_pdf.elements.region import Region
 if TYPE_CHECKING:
     import pdfplumber
@@ -17,7 +18,6 @@ if TYPE_CHECKING:
     from natural_pdf.core.highlighting_service import HighlightingService
     from natural_pdf.elements.base import Element
-from natural_pdf.elements.region import Region
 from natural_pdf.elements.text import TextElement
 from natural_pdf.analyzers.layout.layout_manager import LayoutManager
 from natural_pdf.analyzers.layout.layout_options import LayoutOptions
@@ -120,18 +120,50 @@ class Page:
               raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
          return self._parent.highlighter
-    def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region]) -> 'Page':
+    def clear_exclusions(self) -> 'Page':
+        """
+        Clear all exclusions from the page.
+        """
+        self._exclusions = []
+        return self
+    def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any]) -> 'Page':
         """
         Add an exclusion to the page. Text from these regions will be excluded from extraction.
+        Ensures non-callable items are stored as Region objects if possible.
         Args:
-            exclusion_func_or_region: Either a Region object or a function that takes a Page
-                                      and returns a Region to exclude
+            exclusion_func_or_region: Either a callable function returning a Region,
+                                      a Region object, or another object with a valid .bbox attribute.
         Returns:
             Self for method chaining
-        """
-        self._exclusions.append(exclusion_func_or_region)
+        Raises:
+            TypeError: If a non-callable, non-Region object without a valid bbox is provided.
+        """
+        if callable(exclusion_func_or_region):
+            # Store callable functions directly
+            self._exclusions.append(exclusion_func_or_region)
+            logger.debug(f"Page {self.index}: Added callable exclusion: {exclusion_func_or_region}")
+        elif isinstance(exclusion_func_or_region, Region):
+            # Store Region objects directly
+            self._exclusions.append(exclusion_func_or_region)
+            logger.debug(f"Page {self.index}: Added Region exclusion: {exclusion_func_or_region}")
+        elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
+            # Convert objects with a valid bbox to a Region before storing
+            try:
+                bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
+                region_to_add = Region(self, bbox_coords)
+                self._exclusions.append(region_to_add)
+                logger.debug(f"Page {self.index}: Added exclusion converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
+            except (ValueError, TypeError, Exception) as e:
+                # Raise an error if conversion fails
+                raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
+        else:
+            # Reject invalid types
+            raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
         return self
     def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
@@ -190,6 +222,7 @@ class Page:
     def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
         """
         Get all exclusion regions for this page.
+        Assumes self._exclusions contains only callables or Region objects.
         Args:
             include_callable: Whether to evaluate callable exclusion functions
@@ -207,15 +240,14 @@ class Page:
         for i, exclusion in enumerate(self._exclusions):
             # Get exclusion label if it's a tuple from PDF level
             exclusion_label = f"exclusion {i}"
-            original_exclusion = exclusion
-            # Check if it's a tuple from PDF.add_exclusion
+            original_exclusion = exclusion # Keep track for debugging
+            # Check if it's a tuple from PDF.add_exclusion (should still be handled if PDF adds labels)
             if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
-                # This is likely from PDF.add_exclusion with (func, label)
                 exclusion_func, label = exclusion
                 if label:
                     exclusion_label = label
-                exclusion = exclusion_func
+                exclusion = exclusion_func # Use the function part
             # Process callable exclusion functions
             if callable(exclusion) and include_callable:
@@ -224,40 +256,45 @@ class Page:
                     if debug:
                         print(f"  - Evaluating callable {exclusion_label}...")
-                    # Create a temporary copy of exclusions to avoid recursion
-                    original_exclusions = self._exclusions
-                    self._exclusions = []  # Temporarily clear exclusions
+                    # Temporarily clear exclusions to avoid potential recursion if the callable uses exclusions itself
+                    # This might be overly cautious depending on use case, but safer.
+                    temp_original_exclusions = self._exclusions
+                    self._exclusions = []
-                    # Call the function
-                    region = exclusion(self)
+                    # Call the function - Expects it to return a Region or None
+                    region_result = exclusion(self)
                     # Restore exclusions
-                    self._exclusions = original_exclusions
+                    self._exclusions = temp_original_exclusions
-                    if region:
-                        regions.append(region)
+                    if isinstance(region_result, Region):
+                        regions.append(region_result)
                         if debug:
-                            print(f"    ✓ Added region: {region}")
+                            print(f"    ✓ Added region from callable: {region_result}")
+                    elif region_result:
+                         # Log warning if callable returned something other than Region/None
+                         logger.warning(f"Callable exclusion {exclusion_label} returned non-Region object: {type(region_result)}. Skipping.")
+                         if debug:
+                             print(f"    ✗ Callable returned non-Region/None: {type(region_result)}")
                     else:
                         if debug:
-                            print(f"    ✗ Function returned None, no region added")
+                            print(f"    ✗ Callable returned None, no region added")
                 except Exception as e:
-                    error_msg = f"Error in {exclusion_label} for page {self.index}: {e}"
+                    error_msg = f"Error evaluating callable exclusion {exclusion_label} for page {self.index}: {e}"
                     print(error_msg)
-                    # Print more detailed traceback for debugging
                     import traceback
                     print(f"    Traceback: {traceback.format_exc().splitlines()[-3:]}")
-            # Process direct Region objects
-            elif not callable(exclusion):
-                # It's already a Region object
+            # Process direct Region objects (already validated by add_exclusion)
+            elif isinstance(exclusion, Region):
                 regions.append(exclusion)
                 if debug:
                     print(f"  - Added direct region: {exclusion}")
+            # No else needed, add_exclusion should prevent invalid types
         if debug:
-            print(f"Page {self.index}: Found {len(regions)} valid exclusion regions")
+            print(f"Page {self.index}: Found {len(regions)} valid exclusion regions to apply")
         return regions
@@ -1178,6 +1215,34 @@ class Page:
         return ElementCollection(detected_regions)
+    def clear_detected_layout_regions(self) -> 'Page':
+        """
+        Removes all regions from this page that were added by layout analysis
+        (i.e., regions where `source` attribute is 'detected').
+        This clears the regions both from the page's internal `_regions['detected']` list
+        and from the ElementManager's internal list of regions.
+        Returns:
+            Self for method chaining.
+        """
+        if not hasattr(self._element_mgr, 'regions') or not hasattr(self._element_mgr, '_elements') or 'regions' not in self._element_mgr._elements:
+             logger.debug(f"Page {self.index}: No regions found in ElementManager, nothing to clear.")
+             self._regions['detected'] = [] # Ensure page's list is also clear
+             return self
+        # Filter ElementManager's list to keep only non-detected regions
+        original_count = len(self._element_mgr.regions)
+        self._element_mgr._elements['regions'] = [r for r in self._element_mgr.regions if getattr(r, 'source', None) != 'detected']
+        new_count = len(self._element_mgr.regions)
+        removed_count = original_count - new_count
+        # Clear the page's specific list of detected regions
+        self._regions['detected'] = []
+        logger.info(f"Page {self.index}: Cleared {removed_count} detected layout regions.")
+        return self
     def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both') -> Optional[Region]: # Return Optional
         """
         Get a section between two elements on this page.

natural_pdf/core/pdf.py CHANGED Viewed

@@ -125,6 +125,17 @@ class PDF:
         from natural_pdf.elements.collections import PageCollection
         return PageCollection(self._pages)
+    def clear_exclusions(self) -> 'PDF':
+        """
+        Clear all exclusion functions from the PDF.
+        Returns:
+            Self for method chaining
+        """
+        self._exclusions = []
+        return self
     def add_exclusion(self, exclusion_func: Callable[[Page], Region], label: str = None) -> 'PDF':
         """
         Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.

natural_pdf/elements/base.py CHANGED Viewed

@@ -7,7 +7,8 @@ from PIL import Image
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
     from natural_pdf.elements.region import Region
-    from natural_pdf.elements.base import Element, DirectionalMixin
+    from natural_pdf.elements.base import Element
+    from natural_pdf.elements.collections import ElementCollection
 class DirectionalMixin:
@@ -17,7 +18,7 @@ class DirectionalMixin:
     def _direction(self, direction: str, size: Optional[float] = None,
                    cross_size: str = "full", include_element: bool = False,
-                   until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
+                   until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
         """
         Protected helper method to create a region in a specified direction relative to this element/region.
@@ -154,7 +155,7 @@ class DirectionalMixin:
         return result
     def above(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
-             until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
+             until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
         """
         Select region above this element/region.
@@ -180,7 +181,7 @@ class DirectionalMixin:
         )
     def below(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
-              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
+              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
         """
         Select region below this element/region.
@@ -206,7 +207,7 @@ class DirectionalMixin:
         )
     def left(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
-             until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
+             until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
         """
         Select region to the left of this element/region.
@@ -232,7 +233,7 @@ class DirectionalMixin:
         )
     def right(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
-              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
+              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
         """
         Select region to the right of this element/region.
@@ -257,6 +258,86 @@ class DirectionalMixin:
             **kwargs
         )
+    def expand(self,
+              left: float = 0,
+              right: float = 0,
+              top_expand: float = 0,  # Renamed to avoid conflict
+              bottom_expand: float = 0,  # Renamed to avoid conflict
+              width_factor: float = 1.0,
+              height_factor: float = 1.0,
+              # Keep original parameter names for backward compatibility
+              top: float = None,
+              bottom: float = None) -> 'Region':
+        """
+        Create a new region expanded from this element/region.
+        Args:
+            left: Amount to expand left edge (positive value expands leftwards)
+            right: Amount to expand right edge (positive value expands rightwards)
+            top_expand: Amount to expand top edge (positive value expands upwards)
+            bottom_expand: Amount to expand bottom edge (positive value expands downwards)
+            width_factor: Factor to multiply width by (applied after absolute expansion)
+            height_factor: Factor to multiply height by (applied after absolute expansion)
+            top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
+            bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
+        Returns:
+            New expanded Region object
+        """
+        # Start with current coordinates
+        new_x0 = self.x0
+        new_x1 = self.x1
+        new_top = self.top
+        new_bottom = self.bottom
+        # Handle the deprecated parameter names for backward compatibility
+        if top is not None:
+            top_expand = top
+        if bottom is not None:
+            bottom_expand = bottom
+        # Apply absolute expansions first
+        new_x0 -= left
+        new_x1 += right
+        new_top -= top_expand  # Expand upward (decrease top coordinate)
+        new_bottom += bottom_expand  # Expand downward (increase bottom coordinate)
+        # Apply percentage factors if provided
+        if width_factor != 1.0 or height_factor != 1.0:
+            # Calculate center point *after* absolute expansion
+            center_x = (new_x0 + new_x1) / 2
+            center_y = (new_top + new_bottom) / 2
+            # Calculate current width and height *after* absolute expansion
+            current_width = new_x1 - new_x0
+            current_height = new_bottom - new_top
+            # Calculate new width and height
+            new_width = current_width * width_factor
+            new_height = current_height * height_factor
+            # Adjust coordinates based on the new dimensions, keeping the center
+            new_x0 = center_x - new_width / 2
+            new_x1 = center_x + new_width / 2
+            new_top = center_y - new_height / 2
+            new_bottom = center_y + new_height / 2
+        # Clamp coordinates to page boundaries
+        new_x0 = max(0, new_x0)
+        new_top = max(0, new_top)
+        new_x1 = min(self.page.width, new_x1)
+        new_bottom = min(self.page.height, new_bottom)
+        # Ensure coordinates are valid (x0 <= x1, top <= bottom)
+        if new_x0 > new_x1: new_x0 = new_x1 = (new_x0 + new_x1) / 2
+        if new_top > new_bottom: new_top = new_bottom = (new_top + new_bottom) / 2
+        # Create new region with expanded bbox
+        from natural_pdf.elements.region import Region
+        new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
+        return new_region
 class Element(DirectionalMixin):
     """
@@ -415,7 +496,8 @@ class Element(DirectionalMixin):
             candidates = candidates[:limit] if limit else candidates
             # Find matching elements
-            matches = self.page.filter_elements(candidates, selector, **kwargs)
+            from natural_pdf.elements.collections import ElementCollection
+            matches = ElementCollection(candidates).find_all(selector, **kwargs)
             return matches[0] if matches else None
         elif idx + 1 < len(all_elements):
             # No selector, just return the next element
@@ -449,16 +531,17 @@ class Element(DirectionalMixin):
         # Search for previous matching element
         if selector:
-            # Filter elements before this one
+            # Select elements before this one
             candidates = all_elements[:idx]
-            # Reverse to start from closest to this element
+            # Reverse to search backwards from the current element
             candidates = candidates[::-1]
             # Limit search range for performance
             candidates = candidates[:limit] if limit else candidates
-            # Find matching elements
-            matches = self.page.filter_elements(candidates, selector, **kwargs)
-            return matches[0] if matches else None
+            # Find matching elements using ElementCollection
+            from natural_pdf.elements.collections import ElementCollection
+            matches = ElementCollection(candidates).find_all(selector, **kwargs)
+            return matches[0] if matches else None # find_all returns a collection
         elif idx > 0:
             # No selector, just return the previous element
             return all_elements[idx - 1]
@@ -737,8 +820,9 @@ class Element(DirectionalMixin):
         Returns:
             First matching element or None
         """
-        # Create a temporary region from this element's bounds
         from natural_pdf.elements.region import Region
+        # Create a temporary region from this element's bounds
         temp_region = Region(self.page, self.bbox)
         return temp_region.find(selector, apply_exclusions=apply_exclusions, **kwargs)
@@ -755,7 +839,8 @@ class Element(DirectionalMixin):
         Returns:
             ElementCollection with matching elements
         """
-        # Create a temporary region from this element's bounds
         from natural_pdf.elements.region import Region
+        # Create a temporary region from this element's bounds
         temp_region = Region(self.page, self.bbox)
         return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)

natural_pdf/elements/collections.py CHANGED Viewed

@@ -2,6 +2,7 @@ import logging
 from typing import List, Optional, Dict, Any, Union, Callable, TypeVar, Generic, Iterator, Tuple, TYPE_CHECKING
 from natural_pdf.ocr import OCROptions
+from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
 logger = logging.getLogger(__name__)
@@ -882,6 +883,61 @@ class ElementCollection(Generic[T]):
              logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
              return None
+    def find_all(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> 'ElementCollection[T]':
+        """
+        Filter elements within this collection matching the selector.
+        Args:
+            selector: CSS-like selector string.
+            regex: Whether to use regex for text search in :contains (default: False).
+            case: Whether to do case-sensitive text search (default: True).
+            **kwargs: Additional filter parameters passed to the selector function.
+        Returns:
+            A new ElementCollection containing only the matching elements from this collection.
+        """
+        if not self._elements:
+            return ElementCollection([])
+        try:
+            selector_obj = parse_selector(selector)
+        except Exception as e:
+            logger.error(f"Error parsing selector '{selector}': {e}")
+            return ElementCollection([]) # Return empty on parse error
+        # Pass regex and case flags to selector function generator
+        kwargs['regex'] = regex
+        kwargs['case'] = case
+        try:
+            filter_func = selector_to_filter_func(selector_obj, **kwargs)
+        except Exception as e:
+            logger.error(f"Error creating filter function for selector '{selector}': {e}")
+            return ElementCollection([]) # Return empty on filter creation error
+        matching_elements = [element for element in self._elements if filter_func(element)]
+        # Note: Unlike Page.find_all, this doesn't re-sort.
+        # Sorting should be done explicitly on the collection if needed.
+        return ElementCollection(matching_elements)
+    def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
+        """
+        Find the first element within this collection matching the selector.
+        Args:
+            selector: CSS-like selector string.
+            regex: Whether to use regex for text search in :contains (default: False).
+            case: Whether to do case-sensitive text search (default: True).
+            **kwargs: Additional filter parameters passed to the selector function.
+        Returns:
+            The first matching element or None.
+        """
+        results = self.find_all(selector, regex=regex, case=case, **kwargs)
+        return results.first
 class PageCollection(Generic[P]):
     """
     A collection of PDF pages with cross-page operations.

natural_pdf/elements/region.py CHANGED Viewed

@@ -761,8 +761,6 @@ class Region(DirectionalMixin):
             exclusion_regions = self._page._get_exclusion_regions(include_callable=True)
             if debug:
-                import logging
-                logger = logging.getLogger("natural_pdf.elements.region")
                 logger.debug(f"Region {self.bbox} with {len(exclusion_regions)} exclusion regions")
         # IMPROVEMENT 1: Check if the region intersects with any exclusion zone
@@ -777,16 +775,12 @@ class Region(DirectionalMixin):
                 if overlap:
                     has_intersection = True
                     if debug:
-                        import logging
-                        logger = logging.getLogger("natural_pdf.elements.region")
                         logger.debug(f"  Region intersects with exclusion {i}: {exclusion.bbox}")
                     break
             # If no intersection, process without exclusions
             if not has_intersection:
                 if debug:
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"  No intersection with any exclusion, ignoring exclusions")
                 apply_exclusions = False
                 exclusion_regions = []
@@ -809,8 +803,6 @@ class Region(DirectionalMixin):
                              abs(exclusion.x1 - self.page.width) < 5)
                 if debug:
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"  Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
                 if full_width:
@@ -827,8 +819,6 @@ class Region(DirectionalMixin):
                 bottom_bound = self.bottom
                 if debug:
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"  Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
                 # Process only header/footer exclusions for cropping
@@ -838,8 +828,6 @@ class Region(DirectionalMixin):
                         # Move top bound to exclude the header
                         top_bound = max(top_bound, exclusion.bottom)
                         if debug:
-                            import logging
-                            logger = logging.getLogger("natural_pdf.elements.region")
                             logger.debug(f"  Adjusted top bound to {top_bound} due to header exclusion")
                     # If exclusion is at the bottom of our region
@@ -847,14 +835,10 @@ class Region(DirectionalMixin):
                         # Move bottom bound to exclude the footer
                         bottom_bound = min(bottom_bound, exclusion.top)
                         if debug:
-                            import logging
-                            logger = logging.getLogger("natural_pdf.elements.region")
                             logger.debug(f"  Adjusted bottom bound to {bottom_bound} due to footer exclusion")
                 if debug:
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"  Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
                 # If we still have a valid region after exclusions
@@ -865,8 +849,6 @@ class Region(DirectionalMixin):
                     result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
                     if debug:
-                        import logging
-                        logger = logging.getLogger("natural_pdf.elements.region")
                         logger.debug(f"  Successfully extracted text using crop, got {len(result)} characters")
                     # Skip the complex filtering approach
@@ -874,16 +856,12 @@ class Region(DirectionalMixin):
                 else:
                     # This would only happen if the region is entirely inside an exclusion zone
                     # or if both top and bottom of the region are excluded leaving no valid area
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
                     return ""
             # We have exclusions, but not all are headers/footers,
             # or we have a non-rectangular region
             else:
                 if debug:
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"  Mixed exclusion types or non-rectangular region, switching to filtering")
                 # Don't use crop for mixed exclusion types
@@ -902,16 +880,13 @@ class Region(DirectionalMixin):
             return result
         # For all other cases (complex exclusions, polygons), we use element filtering
-        import warnings
-        import logging
-        logger = logging.getLogger("natural_pdf.elements.region")
         if debug:
             logger.debug(f"Using element filtering approach for region {self.bbox}")
-        # Get all elements in this region first
-        all_elements = self.get_elements(apply_exclusions=False)
+        # Get only word elements in this region first (instead of ALL elements)
+        # This prevents duplication from joining both char and word text
+        all_elements = [e for e in self.page.words if self._is_element_in_region(e)]
         if apply_exclusions and exclusion_regions:
             if debug:
                 logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
@@ -1325,83 +1300,6 @@ class Region(DirectionalMixin):
         return elements
-    def expand(self,
-              left: float = 0,
-              right: float = 0,
-              top_expand: float = 0,  # Renamed to avoid conflict
-              bottom_expand: float = 0,  # Renamed to avoid conflict
-              width_factor: float = 1.0,
-              height_factor: float = 1.0,
-              # Keep original parameter names for backward compatibility
-              top: float = None,
-              bottom: float = None) -> 'Region':
-        """
-        Create a new region expanded from this one.
-        Args:
-            left: Amount to expand left edge
-            right: Amount to expand right edge
-            top_expand: Amount to expand top edge (upward)
-            bottom_expand: Amount to expand bottom edge (downward)
-            width_factor: Factor to multiply width by
-            height_factor: Factor to multiply height by
-            top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
-            bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
-        Returns:
-            New expanded Region
-        """
-        # Start with current coordinates
-        new_x0 = self.x0
-        new_x1 = self.x1
-        new_top = self.top
-        new_bottom = self.bottom
-        # Handle the deprecated parameter names for backward compatibility
-        if top is not None:
-            top_expand = top
-        if bottom is not None:
-            bottom_expand = bottom
-        # Apply absolute expansions first
-        new_x0 -= left
-        new_x1 += right
-        new_top -= top_expand  # Expand upward (decrease top coordinate)
-        new_bottom += bottom_expand  # Expand downward (increase bottom coordinate)
-        # Apply percentage factors if provided
-        if width_factor != 1.0 or height_factor != 1.0:
-            # Current width and height
-            current_width = new_x1 - new_x0
-            current_height = new_bottom - new_top
-            # Calculate new width and height
-            new_width = current_width * width_factor
-            new_height = current_height * height_factor
-            # Calculate width and height differences
-            width_diff = new_width - current_width
-            height_diff = new_height - current_height
-            # Adjust coordinates to maintain center point
-            new_x0 -= width_diff / 2
-            new_x1 += width_diff / 2
-            new_top -= height_diff / 2
-            new_bottom += height_diff / 2
-        # Create new region with expanded bbox
-        new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
-        # Copy multi-page properties if present
-        if self._spans_pages:
-            new_region._spans_pages = True
-            new_region._multi_page_elements = self._multi_page_elements
-            new_region._page_range = self._page_range
-            new_region.start_element = self.start_element
-            new_region.end_element = self.end_element
-        return new_region
     def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
         """
         Get a section between two elements within this region.

natural_pdf/qa/document_qa.py CHANGED Viewed

@@ -5,6 +5,7 @@ from PIL import Image, ImageDraw
 import os
 import tempfile
 import json
+from natural_pdf.elements.collections import ElementCollection
 logger = logging.getLogger("natural_pdf.qa.document_qa")
@@ -304,8 +305,8 @@ class DocumentQA:
                             # Remove from matched texts to avoid duplicates
                             if element.text in matched_texts:
                                 matched_texts.remove(element.text)
-                    result["source_elements"] = source_elements
+                    result["source_elements"] = ElementCollection(source_elements)
             return result
@@ -386,7 +387,7 @@ class DocumentQA:
                             if element.text in matched_texts:
                                 matched_texts.remove(element.text)
-                    result["source_elements"] = source_elements
+                    result["source_elements"] = ElementCollection(source_elements)
             return result

natural_pdf/selectors/parser.py CHANGED Viewed

@@ -351,4 +351,218 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
         return abs(value1 - value2) <= tolerance
     # Default to exact match for other types
-    return value1 == value2
+    return value1 == value2
+PSEUDO_CLASS_FUNCTIONS = {
+    'bold': lambda el: hasattr(el, 'bold') and el.bold,
+    'italic': lambda el: hasattr(el, 'italic') and el.italic,
+    'first-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[0] == el, # Example placeholder
+    'last-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[-1] == el, # Example placeholder
+    # Add the new pseudo-classes for negation
+    'not-bold': lambda el: hasattr(el, 'bold') and not el.bold,
+    'not-italic': lambda el: hasattr(el, 'italic') and not el.italic,
+}
+def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
+    """
+    Convert a parsed selector to a filter function.
+    Args:
+        selector: Parsed selector dictionary
+        **kwargs: Additional filter parameters including:
+                 - regex: Whether to use regex for text search
+                 - case: Whether to do case-sensitive text search
+    Returns:
+        Function that takes an element and returns True if it matches
+    """
+    def filter_func(element):
+        # Check element type
+        if selector['type'] != 'any':
+            # Special handling for 'text' type to match both 'text', 'char', and 'word'
+            if selector['type'] == 'text':
+                if element.type not in ['text', 'char', 'word']:
+                    return False
+            # Special handling for 'region' type to check for detected layout regions
+            elif selector['type'] == 'region':
+                # Check if this is a Region with region_type property
+                if not hasattr(element, 'region_type'):
+                    return False
+                # If 'type' attribute specified, it will be checked in the attributes section
+            # Check for Docling-specific types (section-header, etc.)
+            elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
+                # This is a direct match with a Docling region type
+                pass
+            # Otherwise, require exact match with the element's type attribute
+            elif not hasattr(element, 'type') or element.type != selector['type']:
+                return False
+        # Check attributes
+        for name, attr_info in selector['attributes'].items():
+            op = attr_info['op']
+            value = attr_info['value']
+            # Special case for fontname attribute - allow matching part of the name
+            if name == 'fontname' and op == '*=':
+                element_value = getattr(element, name, None)
+                if element_value is None or value.lower() not in element_value.lower():
+                    return False
+                continue
+            # Convert hyphenated attribute names to underscore for Python properties
+            python_name = name.replace('-', '_')
+            # Special case for region attributes
+            if selector['type'] == 'region':
+                if name == 'type':
+                    # Use normalized_type for comparison if available
+                    if hasattr(element, 'normalized_type') and element.normalized_type:
+                        element_value = element.normalized_type
+                    else:
+                        # Convert spaces to hyphens for consistency with the normalized format
+                        element_value = getattr(element, 'region_type', '').lower().replace(' ', '-')
+                elif name == 'model':
+                    # Special handling for model attribute in regions
+                    element_value = getattr(element, 'model', None)
+                else:
+                    # Get the attribute value from the element normally
+                    element_value = getattr(element, python_name, None)
+            else:
+                # Get the attribute value from the element normally for non-region elements
+                element_value = getattr(element, python_name, None)
+            if element_value is None:
+                return False
+            # Apply operator
+            if op == '=':
+                if element_value != value:
+                    return False
+            elif op == '~=':
+                # Approximate match (e.g., for colors)
+                if not _is_approximate_match(element_value, value):
+                    return False
+            elif op == '>=':
+                # Greater than or equal (element value must be >= specified value)
+                if not (isinstance(element_value, (int, float)) and
+                        isinstance(value, (int, float)) and
+                        element_value >= value):
+                    return False
+            elif op == '<=':
+                # Less than or equal (element value must be <= specified value)
+                if not (isinstance(element_value, (int, float)) and
+                        isinstance(value, (int, float)) and
+                        element_value <= value):
+                    return False
+            elif op == '>':
+                # Greater than (element value must be > specified value)
+                if not (isinstance(element_value, (int, float)) and
+                        isinstance(value, (int, float)) and
+                        element_value > value):
+                    return False
+            elif op == '<':
+                # Less than (element value must be < specified value)
+                if not (isinstance(element_value, (int, float)) and
+                        isinstance(value, (int, float)) and
+                        element_value < value):
+                    return False
+        # Check pseudo-classes
+        for pseudo in selector['pseudo_classes']:
+            name = pseudo['name']
+            args = pseudo['args']
+            # Handle various pseudo-classes
+            if name == 'contains' and hasattr(element, 'text'):
+                use_regex = kwargs.get('regex', False)
+                ignore_case = not kwargs.get('case', True)
+                if use_regex:
+                    import re
+                    if not element.text:
+                        return False
+                    try:
+                        pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
+                        if not pattern.search(element.text):
+                            return False
+                    except re.error:
+                        # If regex is invalid, fall back to literal text search
+                        element_text = element.text
+                        search_text = args
+                        if ignore_case:
+                            element_text = element_text.lower()
+                            search_text = search_text.lower()
+                        if search_text not in element_text:
+                            return False
+                else:
+                    # String comparison with case sensitivity option
+                    if not element.text:
+                        return False
+                    element_text = element.text
+                    search_text = args
+                    if ignore_case:
+                        element_text = element_text.lower()
+                        search_text = search_text.lower()
+                    if search_text not in element_text:
+                        return False
+            elif name == 'starts-with' and hasattr(element, 'text'):
+                if not element.text or not element.text.startswith(args):
+                    return False
+            elif name == 'ends-with' and hasattr(element, 'text'):
+                if not element.text or not element.text.endswith(args):
+                    return False
+            elif name == 'bold':
+                if not (hasattr(element, 'bold') and element.bold):
+                    return False
+            elif name == 'italic':
+                if not (hasattr(element, 'italic') and element.italic):
+                    return False
+            elif name == 'horizontal':
+                if not (hasattr(element, 'is_horizontal') and element.is_horizontal):
+                    return False
+            elif name == 'vertical':
+                if not (hasattr(element, 'is_vertical') and element.is_vertical):
+                    return False
+            else:
+                # Check pseudo-classes (basic ones like :bold, :italic)
+                if name in PSEUDO_CLASS_FUNCTIONS:
+                    if not PSEUDO_CLASS_FUNCTIONS[name](element):
+                        return False
+                elif name == 'contains':
+                    if not hasattr(element, 'text') or not element.text:
+                        return False
+                    text_to_check = element.text
+                    search_term = args
+                    if not kwargs.get('case', True): # Check case flag from kwargs
+                        text_to_check = text_to_check.lower()
+                        search_term = search_term.lower()
+                    if kwargs.get('regex', False): # Check regex flag from kwargs
+                        try:
+                            if not re.search(search_term, text_to_check):
+                                return False
+                        except re.error as e:
+                             logger.warning(f"Invalid regex in :contains selector '{search_term}': {e}")
+                             return False # Invalid regex cannot match
+                    else:
+                        if search_term not in text_to_check:
+                            return False
+                # Skip complex pseudo-classes like :near, :above here, handled later
+                elif name in ('above', 'below', 'near', 'left-of', 'right-of'):
+                    pass # Handled separately after initial filtering
+                else:
+                     # Optionally log unknown pseudo-classes
+                     # logger.warning(f"Unknown pseudo-class: {name}")
+                     pass
+        return True # Element passes all attribute and simple pseudo-class filters
+    return filter_func

natural_pdf/utils/visualization.py CHANGED Viewed

@@ -127,10 +127,10 @@ def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
     # Try to load a font, use default if not available
     try:
         # Use a commonly available font, adjust size
-        font = ImageFont.truetype("DejaVuSans.ttf", 12)
+        font = ImageFont.truetype("DejaVuSans.ttf", 14)
     except IOError:
         try:
-             font = ImageFont.truetype("Arial.ttf", 12)
+             font = ImageFont.truetype("Arial.ttf", 14)
         except IOError:
             font = ImageFont.load_default()

{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.0
+Version: 0.1.1
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -48,7 +48,7 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
 Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
 - [Complete documentation here](https://jsoma.github.io/natural-pdf)
-- [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
+- [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
 ## Features
@@ -74,18 +74,16 @@ pip install natural-pdf
 # Installs the core library along with required AI dependencies (PyTorch, Transformers)
 ```bash
-# Install with support for specific OCR engines
-pip install natural-pdf[easyocr]   # EasyOCR engine
-pip install natural-pdf[paddle]    # PaddleOCR engine (requires paddlepaddle)
-pip install natural-pdf[surya]     # Surya OCR engine
-# Install with support for YOLO layout detection model
+# Install with support for specific OCR and layout engines
+pip install natural-pdf[easyocr]
+pip install natural-pdf[paddle]
+pip install natural-pdf[surya]
 pip install natural-pdf[layout_yolo]
 # Install with support for the interactive Jupyter widget
 pip install natural-pdf[interactive]
-# Install everything
+# Just install everything
 pip install natural-pdf[all]
 ```
@@ -119,6 +117,8 @@ clean_text = page.extract_text()
 print(clean_text)
 ```
+- [Complete documentation here](https://jsoma.github.io/natural-pdf)
 ## Selectors
 The library supports CSS-like selectors for finding elements:
@@ -185,7 +185,7 @@ Exclusions work efficiently with different region types:
 ## OCR Integration
-Extract text from scanned documents using OCR, with support for multiple engines (EasyOCR, PaddleOCR, Surya):
+Extract text from scanned documents using OCR, with support for multiple engines ([EasyOCR](https://www.jaided.ai/easyocr/), [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html), [Surya](https://github.com/VikParuchuri/surya)):
 ```python
 # Apply OCR using a specific engine (e.g., PaddleOCR)

{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-natural_pdf/__init__.py,sha256=kKHL7SWzk0_ydDDX12X5W3s9-vEKgVYOBubXzp_SCdM,1784
+natural_pdf/__init__.py,sha256=hsSosbPnvDRCfyYAL9bf1haVS6oBxLAl7cbKTWRTHkU,1784
 natural_pdf/analyzers/__init__.py,sha256=BkSmEqw5J76C2fvYHF86EXQJQWWFNIvjSwRMwfW-Ht0,140
 natural_pdf/analyzers/text_options.py,sha256=9IGRoem1O2mc1ZNGiM5-VPRZ3c8LLwEk1B3is9UxMoE,2777
 natural_pdf/analyzers/text_structure.py,sha256=e4G6v0bD7ZJCdo6DcuDD3iZt8KAwBfALMduwZHGh0wI,12415
@@ -15,15 +15,15 @@ natural_pdf/analyzers/layout/tatr.py,sha256=H0Xygk9jA46-vlPleoal94cuDyz-LHTSxVb3
 natural_pdf/analyzers/layout/yolo.py,sha256=NSQK3TcS1qN8D2MDxCvcwTpS_kvzGy3I2LepJDUceoQ,7699
 natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
 natural_pdf/core/element_manager.py,sha256=H1896JSt48ASLSmG22xEXMY-xSKcpYsUlYmYMD48i6Q,17117
-natural_pdf/core/highlighting_service.py,sha256=gcWZnvlscg32anJrh0m3gVgIyrRKTMDHIL5Ft8OOTjA,29454
-natural_pdf/core/page.py,sha256=rKXxdnG4cl8qjRoKEBxXL9ncLWvujDoVWQ9_D9ouHxc,64428
-natural_pdf/core/pdf.py,sha256=VAAe-BU8bcbCTiQ43fp8lsVy8q8KSfN9eAbFp9mJOWw,28296
+natural_pdf/core/highlighting_service.py,sha256=a-40UMohOglYrw4klW1GuQ_p3jZOxnAfPOXPORThr4U,31476
+natural_pdf/core/page.py,sha256=tnxG-5OhFVuFHt0p-a9YSLU-nXjA8fftg5ViQdH5sOU,68512
+natural_pdf/core/pdf.py,sha256=UzxVfVeCnhSN7rxdJresUj_UNFkcFkeaEjLvwZMJS-c,28532
 natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
-natural_pdf/elements/base.py,sha256=Bi6hylE1N252d-GSPZy1mFMvnWh18b9dEGbIRXthq88,32057
-natural_pdf/elements/collections.py,sha256=qkpUZuf08n-NPhCrOE40cRg-T2F5jpba1Xhuo2CKr-c,59982
+natural_pdf/elements/base.py,sha256=9SQ-O2qbQe9Avbf9JI-p6vWlyThZVch-p1yqXWSrBHw,35750
+natural_pdf/elements/collections.py,sha256=RJf4cBZeLfCtfS0-SjzYFRCtbzYjWsgk3LrcTwJAYMs,62392
 natural_pdf/elements/line.py,sha256=QvVdhf_K6rwJkq3q67JmgdZpDhrBgWuSMF-Q25malP4,4783
 natural_pdf/elements/rect.py,sha256=dls9g-R213O78HvfAJMak3_eV14Zh654Zw7hqTTXxDQ,3949
-natural_pdf/elements/region.py,sha256=GVenh3ICfojVpSpwKMEayUBBesywowPTTk7y44MLo6g,76835
+natural_pdf/elements/region.py,sha256=MXQK00LLMvwuq94NigeeCVFoGov_RWFe9ZylnIMpzB0,72453
 natural_pdf/elements/text.py,sha256=OAuy0ozaemj6yjMwhXPsJ76VZtRPeJbmrFTzpDJA2_U,11017
 natural_pdf/ocr/__init__.py,sha256=mbUUsCfeU6yRsEqNn3I4Len-XY6FfjfKhTAoWDLA1f4,1943
 natural_pdf/ocr/engine.py,sha256=xDnvhnm4Lr7d83ezglDqOtl9xfx74zOOTyYW-fZHQEQ,4183
@@ -33,20 +33,20 @@ natural_pdf/ocr/engine_surya.py,sha256=gWV_BEuLMqmJcKVlag9i45SsO2uLAtI-dayBm1xbD
 natural_pdf/ocr/ocr_manager.py,sha256=mAyCntdAnrNv8TIvGYlGs40G2tDAdMQ_Jqb3owiPWW8,9934
 natural_pdf/ocr/ocr_options.py,sha256=A2CQV172id-90zMpPZWb8CD09ZP0BuQnnCZGEFP4SaQ,3787
 natural_pdf/qa/__init__.py,sha256=kagdfqNMpTnyzjC2EFy_PBX5us38NnJL548ESSQVzfI,107
-natural_pdf/qa/document_qa.py,sha256=x_AYE0kbs7_4n5NC7zWcxQpHFh0vxP3g3q-l_w4RgSU,15845
+natural_pdf/qa/document_qa.py,sha256=QYKKor0RqUQcEdFEBEUdq7L0ktq1WSMfQ-ynTc64cPU,15926
 natural_pdf/selectors/__init__.py,sha256=Jfk-JBZEpQ7V5FWVGuLJQLH-qOfqNLC2AdicncMhrmY,121
-natural_pdf/selectors/parser.py,sha256=scYuM0Kp-Bidc2KaYwOMiMYSeR-6q970-2Xwy5zsdNE,13784
+natural_pdf/selectors/parser.py,sha256=JK1zDVISACkUhzmzWfQMMW8hvsV422lRBFKgDBWOWC4,24108
 natural_pdf/templates/__init__.py,sha256=i7N8epDxZoDDsK4p2iUiMwzKVs97i_KtNk8ATArqlC4,19
 natural_pdf/templates/ocr_debug.html,sha256=Zy9StzBeHFQU8ity6cjFSZLe3TY0QOabUux4c5WQUzs,19171
 natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
 natural_pdf/utils/highlighting.py,sha256=9H8vbWhwgxzjrL7MhAePXUWZZctLPboNocJzy-1TE_g,675
 natural_pdf/utils/reading_order.py,sha256=1oihH9ZTqQvIVDYc2oVEYqIXyPzi94ERtelp6TyzmWU,7594
-natural_pdf/utils/visualization.py,sha256=p2855QGyRXUFNH8rzgrIVzCSbuf8WXwV_j1YgP518uo,8876
+natural_pdf/utils/visualization.py,sha256=14BM-K4ovDqHniNbxbP_y9KaEYNlkbpELGAv9_8aOZ4,8876
 natural_pdf/widgets/__init__.py,sha256=qckw3DjdVTsASPLJ8uUrGKg3MFhvzHndUpeNGlqwg6A,215
 natural_pdf/widgets/viewer.py,sha256=h_amj_uvf-vRqEsFg4P00fgKxawLAd9jjC1ohUza4BY,37479
 natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
-natural_pdf-0.1.0.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
-natural_pdf-0.1.0.dist-info/METADATA,sha256=_RZKF1mkrpUxpdOO0oMy_HGjj0ZlrvAGIblwmheKCQQ,9960
-natural_pdf-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-natural_pdf-0.1.0.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
-natural_pdf-0.1.0.dist-info/RECORD,,
+natural_pdf-0.1.1.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.1.dist-info/METADATA,sha256=8o22GEPtEqlSqexFQxy6tVoHTB35LmT63sjbjbjORRE,10009
+natural_pdf-0.1.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+natural_pdf-0.1.1.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
+natural_pdf-0.1.1.dist-info/RECORD,,

{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

natural-pdf 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

natural-pdf 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl