PyPI - natural-pdf - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl - Mend

natural-pdf 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

natural_pdf/__init__.py +1 -1
natural_pdf/analyzers/layout/layout_analyzer.py +133 -44
natural_pdf/analyzers/layout/layout_manager.py +9 -6
natural_pdf/analyzers/layout/layout_options.py +2 -4
natural_pdf/analyzers/layout/surya.py +199 -91
natural_pdf/core/highlighting_service.py +48 -17
natural_pdf/core/page.py +92 -27
natural_pdf/core/pdf.py +11 -0
natural_pdf/elements/base.py +99 -14
natural_pdf/elements/collections.py +56 -0
natural_pdf/elements/region.py +56 -131
natural_pdf/qa/document_qa.py +4 -3
natural_pdf/selectors/parser.py +215 -1
natural_pdf/utils/visualization.py +2 -2
natural_pdf-0.1.2.dist-info/METADATA +124 -0
{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/RECORD +19 -19
natural_pdf-0.1.0.dist-info/METADATA +0 -295
{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/top_level.txt +0 -0

natural_pdf/elements/region.py CHANGED Viewed

@@ -761,8 +761,6 @@ class Region(DirectionalMixin):
             exclusion_regions = self._page._get_exclusion_regions(include_callable=True)
             if debug:
-                import logging
-                logger = logging.getLogger("natural_pdf.elements.region")
                 logger.debug(f"Region {self.bbox} with {len(exclusion_regions)} exclusion regions")
         # IMPROVEMENT 1: Check if the region intersects with any exclusion zone
@@ -777,16 +775,12 @@ class Region(DirectionalMixin):
                 if overlap:
                     has_intersection = True
                     if debug:
-                        import logging
-                        logger = logging.getLogger("natural_pdf.elements.region")
                         logger.debug(f"  Region intersects with exclusion {i}: {exclusion.bbox}")
                     break
             # If no intersection, process without exclusions
             if not has_intersection:
                 if debug:
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"  No intersection with any exclusion, ignoring exclusions")
                 apply_exclusions = False
                 exclusion_regions = []
@@ -809,8 +803,6 @@ class Region(DirectionalMixin):
                              abs(exclusion.x1 - self.page.width) < 5)
                 if debug:
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"  Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
                 if full_width:
@@ -827,8 +819,6 @@ class Region(DirectionalMixin):
                 bottom_bound = self.bottom
                 if debug:
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"  Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
                 # Process only header/footer exclusions for cropping
@@ -838,8 +828,6 @@ class Region(DirectionalMixin):
                         # Move top bound to exclude the header
                         top_bound = max(top_bound, exclusion.bottom)
                         if debug:
-                            import logging
-                            logger = logging.getLogger("natural_pdf.elements.region")
                             logger.debug(f"  Adjusted top bound to {top_bound} due to header exclusion")
                     # If exclusion is at the bottom of our region
@@ -847,14 +835,10 @@ class Region(DirectionalMixin):
                         # Move bottom bound to exclude the footer
                         bottom_bound = min(bottom_bound, exclusion.top)
                         if debug:
-                            import logging
-                            logger = logging.getLogger("natural_pdf.elements.region")
                             logger.debug(f"  Adjusted bottom bound to {bottom_bound} due to footer exclusion")
                 if debug:
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"  Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
                 # If we still have a valid region after exclusions
@@ -865,8 +849,6 @@ class Region(DirectionalMixin):
                     result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
                     if debug:
-                        import logging
-                        logger = logging.getLogger("natural_pdf.elements.region")
                         logger.debug(f"  Successfully extracted text using crop, got {len(result)} characters")
                     # Skip the complex filtering approach
@@ -874,16 +856,12 @@ class Region(DirectionalMixin):
                 else:
                     # This would only happen if the region is entirely inside an exclusion zone
                     # or if both top and bottom of the region are excluded leaving no valid area
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
                     return ""
             # We have exclusions, but not all are headers/footers,
             # or we have a non-rectangular region
             else:
                 if debug:
-                    import logging
-                    logger = logging.getLogger("natural_pdf.elements.region")
                     logger.debug(f"  Mixed exclusion types or non-rectangular region, switching to filtering")
                 # Don't use crop for mixed exclusion types
@@ -902,16 +880,13 @@ class Region(DirectionalMixin):
             return result
         # For all other cases (complex exclusions, polygons), we use element filtering
-        import warnings
-        import logging
-        logger = logging.getLogger("natural_pdf.elements.region")
         if debug:
             logger.debug(f"Using element filtering approach for region {self.bbox}")
-        # Get all elements in this region first
-        all_elements = self.get_elements(apply_exclusions=False)
+        # Get only word elements in this region first (instead of ALL elements)
+        # This prevents duplication from joining both char and word text
+        all_elements = [e for e in self.page.words if self._is_element_in_region(e)]
         if apply_exclusions and exclusion_regions:
             if debug:
                 logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
@@ -1325,83 +1300,6 @@ class Region(DirectionalMixin):
         return elements
-    def expand(self,
-              left: float = 0,
-              right: float = 0,
-              top_expand: float = 0,  # Renamed to avoid conflict
-              bottom_expand: float = 0,  # Renamed to avoid conflict
-              width_factor: float = 1.0,
-              height_factor: float = 1.0,
-              # Keep original parameter names for backward compatibility
-              top: float = None,
-              bottom: float = None) -> 'Region':
-        """
-        Create a new region expanded from this one.
-        Args:
-            left: Amount to expand left edge
-            right: Amount to expand right edge
-            top_expand: Amount to expand top edge (upward)
-            bottom_expand: Amount to expand bottom edge (downward)
-            width_factor: Factor to multiply width by
-            height_factor: Factor to multiply height by
-            top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
-            bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
-        Returns:
-            New expanded Region
-        """
-        # Start with current coordinates
-        new_x0 = self.x0
-        new_x1 = self.x1
-        new_top = self.top
-        new_bottom = self.bottom
-        # Handle the deprecated parameter names for backward compatibility
-        if top is not None:
-            top_expand = top
-        if bottom is not None:
-            bottom_expand = bottom
-        # Apply absolute expansions first
-        new_x0 -= left
-        new_x1 += right
-        new_top -= top_expand  # Expand upward (decrease top coordinate)
-        new_bottom += bottom_expand  # Expand downward (increase bottom coordinate)
-        # Apply percentage factors if provided
-        if width_factor != 1.0 or height_factor != 1.0:
-            # Current width and height
-            current_width = new_x1 - new_x0
-            current_height = new_bottom - new_top
-            # Calculate new width and height
-            new_width = current_width * width_factor
-            new_height = current_height * height_factor
-            # Calculate width and height differences
-            width_diff = new_width - current_width
-            height_diff = new_height - current_height
-            # Adjust coordinates to maintain center point
-            new_x0 -= width_diff / 2
-            new_x1 += width_diff / 2
-            new_top -= height_diff / 2
-            new_bottom += height_diff / 2
-        # Create new region with expanded bbox
-        new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
-        # Copy multi-page properties if present
-        if self._spans_pages:
-            new_region._spans_pages = True
-            new_region._multi_page_elements = self._multi_page_elements
-            new_region._page_range = self._page_range
-            new_region.start_element = self.start_element
-            new_region.end_element = self.end_element
-        return new_region
     def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
         """
         Get a section between two elements within this region.
@@ -1616,48 +1514,75 @@ class Region(DirectionalMixin):
     def create_cells(self):
         """
-        Create cell regions for a TATR-detected table.
+        Create cell regions for a detected table by intersecting its
+        row and column regions, and add them to the page.
+        Assumes child row and column regions are already present on the page.
         Returns:
-            List of cell regions
+            Self for method chaining.
         """
-        if not (self.region_type == 'table' and self.model == 'tatr'):
-            raise ValueError("Only works for TATR-detected table regions")
+        # Ensure this is called on a table region
+        if self.region_type not in ('table', 'tableofcontents'): # Allow for ToC which might have structure
+            raise ValueError(f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'")
-        # Find rows and columns that belong to this table
-        rows = self.page.find_all(f'region[type=table-row][model=tatr]')
-        columns = self.page.find_all(f'region[type=table-column][model=tatr]')
+        # Find rows and columns associated with this page
+        # Remove the model-specific filter
+        rows = self.page.find_all('region[type=table-row]')
+        columns = self.page.find_all('region[type=table-column]')
-        # Filter to only include those that overlap with this table
+        # Filter to only include those that overlap with this table region
         def is_in_table(element):
-            element_center_x = (element.x0 + element.x1) / 2
-            element_center_y = (element.top + element.bottom) / 2
-            return (self.x0 <= element_center_x <= self.x1 and
-                    self.top <= element_center_y <= self.bottom)
+            # Use a simple overlap check (more robust than just center point)
+            # Check if element's bbox overlaps with self.bbox
+            return (element.x0 < self.x1 and element.x1 > self.x0 and
+                    element.top < self.bottom and element.bottom > self.top)
         table_rows = [r for r in rows if is_in_table(r)]
         table_columns = [c for c in columns if is_in_table(c)]
+        if not table_rows or not table_columns:
+            self._page.logger.warning(f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found.")
+            return self # Return self even if no cells created
         # Sort rows and columns
         table_rows.sort(key=lambda r: r.top)
         table_columns.sort(key=lambda c: c.x0)
-        # Create cells
-        cells = []
+        # Create cells and add them to the page's element manager
+        created_count = 0
         for row in table_rows:
             for column in table_columns:
-                # Create cell region at the intersection
-                cell = self.page.create_region(
-                    column.x0, row.top, column.x1, row.bottom
-                )
-                # Set minimal metadata
-                cell.source = 'derived'
-                cell.region_type = 'table-cell'
-                cell.model = 'tatr'
-                cells.append(cell)
+                # Calculate intersection bbox for the cell
+                cell_x0 = max(row.x0, column.x0)
+                cell_y0 = max(row.top, column.top)
+                cell_x1 = min(row.x1, column.x1)
+                cell_y1 = min(row.bottom, column.bottom)
+                # Only create a cell if the intersection is valid (positive width/height)
+                if cell_x1 > cell_x0 and cell_y1 > cell_y0:
+                    # Create cell region at the intersection
+                    cell = self.page.create_region(
+                        cell_x0, cell_y0, cell_x1, cell_y1
+                    )
+                    # Set metadata
+                    cell.source = 'derived'
+                    cell.region_type = 'table-cell' # Explicitly set type
+                    cell.normalized_type = 'table-cell' # And normalized type
+                    # Inherit model from the parent table region
+                    cell.model = self.model
+                    cell.parent_region = self # Link cell to parent table region
+                    # Add the cell region to the page's element manager
+                    self.page._element_mgr.add_region(cell)
+                    created_count += 1
-        return cells
+        # Optional: Add created cells to the table region's children
+        # self.child_regions.extend(cells_created_in_this_call) # Needs list management
+        self._page.logger.info(f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions.")
+        return self # Return self for chaining
     def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
         """

natural_pdf/qa/document_qa.py CHANGED Viewed

@@ -5,6 +5,7 @@ from PIL import Image, ImageDraw
 import os
 import tempfile
 import json
+from natural_pdf.elements.collections import ElementCollection
 logger = logging.getLogger("natural_pdf.qa.document_qa")
@@ -304,8 +305,8 @@ class DocumentQA:
                             # Remove from matched texts to avoid duplicates
                             if element.text in matched_texts:
                                 matched_texts.remove(element.text)
-                    result["source_elements"] = source_elements
+                    result["source_elements"] = ElementCollection(source_elements)
             return result
@@ -386,7 +387,7 @@ class DocumentQA:
                             if element.text in matched_texts:
                                 matched_texts.remove(element.text)
-                    result["source_elements"] = source_elements
+                    result["source_elements"] = ElementCollection(source_elements)
             return result

natural_pdf/selectors/parser.py CHANGED Viewed

@@ -351,4 +351,218 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
         return abs(value1 - value2) <= tolerance
     # Default to exact match for other types
-    return value1 == value2
+    return value1 == value2
+PSEUDO_CLASS_FUNCTIONS = {
+    'bold': lambda el: hasattr(el, 'bold') and el.bold,
+    'italic': lambda el: hasattr(el, 'italic') and el.italic,
+    'first-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[0] == el, # Example placeholder
+    'last-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[-1] == el, # Example placeholder
+    # Add the new pseudo-classes for negation
+    'not-bold': lambda el: hasattr(el, 'bold') and not el.bold,
+    'not-italic': lambda el: hasattr(el, 'italic') and not el.italic,
+}
+def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
+    """
+    Convert a parsed selector to a filter function.
+    Args:
+        selector: Parsed selector dictionary
+        **kwargs: Additional filter parameters including:
+                 - regex: Whether to use regex for text search
+                 - case: Whether to do case-sensitive text search
+    Returns:
+        Function that takes an element and returns True if it matches
+    """
+    def filter_func(element):
+        # Check element type
+        if selector['type'] != 'any':
+            # Special handling for 'text' type to match both 'text', 'char', and 'word'
+            if selector['type'] == 'text':
+                if element.type not in ['text', 'char', 'word']:
+                    return False
+            # Special handling for 'region' type to check for detected layout regions
+            elif selector['type'] == 'region':
+                # Check if this is a Region with region_type property
+                if not hasattr(element, 'region_type'):
+                    return False
+                # If 'type' attribute specified, it will be checked in the attributes section
+            # Check for Docling-specific types (section-header, etc.)
+            elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
+                # This is a direct match with a Docling region type
+                pass
+            # Otherwise, require exact match with the element's type attribute
+            elif not hasattr(element, 'type') or element.type != selector['type']:
+                return False
+        # Check attributes
+        for name, attr_info in selector['attributes'].items():
+            op = attr_info['op']
+            value = attr_info['value']
+            # Special case for fontname attribute - allow matching part of the name
+            if name == 'fontname' and op == '*=':
+                element_value = getattr(element, name, None)
+                if element_value is None or value.lower() not in element_value.lower():
+                    return False
+                continue
+            # Convert hyphenated attribute names to underscore for Python properties
+            python_name = name.replace('-', '_')
+            # Special case for region attributes
+            if selector['type'] == 'region':
+                if name == 'type':
+                    # Use normalized_type for comparison if available
+                    if hasattr(element, 'normalized_type') and element.normalized_type:
+                        element_value = element.normalized_type
+                    else:
+                        # Convert spaces to hyphens for consistency with the normalized format
+                        element_value = getattr(element, 'region_type', '').lower().replace(' ', '-')
+                elif name == 'model':
+                    # Special handling for model attribute in regions
+                    element_value = getattr(element, 'model', None)
+                else:
+                    # Get the attribute value from the element normally
+                    element_value = getattr(element, python_name, None)
+            else:
+                # Get the attribute value from the element normally for non-region elements
+                element_value = getattr(element, python_name, None)
+            if element_value is None:
+                return False
+            # Apply operator
+            if op == '=':
+                if element_value != value:
+                    return False
+            elif op == '~=':
+                # Approximate match (e.g., for colors)
+                if not _is_approximate_match(element_value, value):
+                    return False
+            elif op == '>=':
+                # Greater than or equal (element value must be >= specified value)
+                if not (isinstance(element_value, (int, float)) and
+                        isinstance(value, (int, float)) and
+                        element_value >= value):
+                    return False
+            elif op == '<=':
+                # Less than or equal (element value must be <= specified value)
+                if not (isinstance(element_value, (int, float)) and
+                        isinstance(value, (int, float)) and
+                        element_value <= value):
+                    return False
+            elif op == '>':
+                # Greater than (element value must be > specified value)
+                if not (isinstance(element_value, (int, float)) and
+                        isinstance(value, (int, float)) and
+                        element_value > value):
+                    return False
+            elif op == '<':
+                # Less than (element value must be < specified value)
+                if not (isinstance(element_value, (int, float)) and
+                        isinstance(value, (int, float)) and
+                        element_value < value):
+                    return False
+        # Check pseudo-classes
+        for pseudo in selector['pseudo_classes']:
+            name = pseudo['name']
+            args = pseudo['args']
+            # Handle various pseudo-classes
+            if name == 'contains' and hasattr(element, 'text'):
+                use_regex = kwargs.get('regex', False)
+                ignore_case = not kwargs.get('case', True)
+                if use_regex:
+                    import re
+                    if not element.text:
+                        return False
+                    try:
+                        pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
+                        if not pattern.search(element.text):
+                            return False
+                    except re.error:
+                        # If regex is invalid, fall back to literal text search
+                        element_text = element.text
+                        search_text = args
+                        if ignore_case:
+                            element_text = element_text.lower()
+                            search_text = search_text.lower()
+                        if search_text not in element_text:
+                            return False
+                else:
+                    # String comparison with case sensitivity option
+                    if not element.text:
+                        return False
+                    element_text = element.text
+                    search_text = args
+                    if ignore_case:
+                        element_text = element_text.lower()
+                        search_text = search_text.lower()
+                    if search_text not in element_text:
+                        return False
+            elif name == 'starts-with' and hasattr(element, 'text'):
+                if not element.text or not element.text.startswith(args):
+                    return False
+            elif name == 'ends-with' and hasattr(element, 'text'):
+                if not element.text or not element.text.endswith(args):
+                    return False
+            elif name == 'bold':
+                if not (hasattr(element, 'bold') and element.bold):
+                    return False
+            elif name == 'italic':
+                if not (hasattr(element, 'italic') and element.italic):
+                    return False
+            elif name == 'horizontal':
+                if not (hasattr(element, 'is_horizontal') and element.is_horizontal):
+                    return False
+            elif name == 'vertical':
+                if not (hasattr(element, 'is_vertical') and element.is_vertical):
+                    return False
+            else:
+                # Check pseudo-classes (basic ones like :bold, :italic)
+                if name in PSEUDO_CLASS_FUNCTIONS:
+                    if not PSEUDO_CLASS_FUNCTIONS[name](element):
+                        return False
+                elif name == 'contains':
+                    if not hasattr(element, 'text') or not element.text:
+                        return False
+                    text_to_check = element.text
+                    search_term = args
+                    if not kwargs.get('case', True): # Check case flag from kwargs
+                        text_to_check = text_to_check.lower()
+                        search_term = search_term.lower()
+                    if kwargs.get('regex', False): # Check regex flag from kwargs
+                        try:
+                            if not re.search(search_term, text_to_check):
+                                return False
+                        except re.error as e:
+                             logger.warning(f"Invalid regex in :contains selector '{search_term}': {e}")
+                             return False # Invalid regex cannot match
+                    else:
+                        if search_term not in text_to_check:
+                            return False
+                # Skip complex pseudo-classes like :near, :above here, handled later
+                elif name in ('above', 'below', 'near', 'left-of', 'right-of'):
+                    pass # Handled separately after initial filtering
+                else:
+                     # Optionally log unknown pseudo-classes
+                     # logger.warning(f"Unknown pseudo-class: {name}")
+                     pass
+        return True # Element passes all attribute and simple pseudo-class filters
+    return filter_func

natural_pdf/utils/visualization.py CHANGED Viewed

@@ -127,10 +127,10 @@ def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
     # Try to load a font, use default if not available
     try:
         # Use a commonly available font, adjust size
-        font = ImageFont.truetype("DejaVuSans.ttf", 12)
+        font = ImageFont.truetype("DejaVuSans.ttf", 14)
     except IOError:
         try:
-             font = ImageFont.truetype("Arial.ttf", 12)
+             font = ImageFont.truetype("Arial.ttf", 14)
         except IOError:
             font = ImageFont.load_default()

natural-pdf 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

natural-pdf 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl